Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp')
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/add_noise.c | 74
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c | 237
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c | 65
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c | 480
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c | 439
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h | 318
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c | 419
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h | 2919
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c | 85
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h | 105
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c | 143
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h | 307
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h | 542
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c | 168
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c | 158
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c | 140
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c | 64
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c | 215
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c | 1361
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c | 640
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c | 757
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c | 625
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c | 88
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c | 89
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c | 371
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h | 474
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c | 2514
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c | 776
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c | 305
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c | 273
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c | 408
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c | 586
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c | 509
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 931
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 183
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 113
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 58
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c | 77
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 764
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c | 674
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c | 58
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c | 513
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c | 776
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm | 66
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 47
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm | 188
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 59
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c | 65
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 59
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm | 46
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h | 919
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c | 1942
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm | 630
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm | 666
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm | 549
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm | 491
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c | 1107
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 443
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c | 290
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c | 344
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c | 570
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm | 34
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c | 490
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c | 137
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h | 223
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c | 100
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h | 1546
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c | 552
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm | 438
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm | 439
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm | 486
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm | 487
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm | 415
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm | 415
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 2110
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 261
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c | 41
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h | 29
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm | 457
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm | 455
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c | 139
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm | 116
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c | 100
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm | 84
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 65
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 320
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/avg.c | 441
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitreader.c | 100
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitreader.h | 163
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c | 44
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h | 47
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitwriter.c | 42
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitwriter.h | 120
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c | 43
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h | 38
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/deblock.c | 196
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/fastssim.c | 498
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/fwd_txfm.c | 809
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/fwd_txfm.h | 25
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/intrapred.c | 917
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/inv_txfm.c | 2701
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/inv_txfm.h | 125
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c | 90
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c | 83
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 41
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 1176
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c | 350
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h | 381
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c | 834
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c | 98
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c | 1320
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c | 214
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c | 458
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h | 167
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c | 248
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c | 717
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 874
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c | 371
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h | 48
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c | 263
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h | 62
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c | 972
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 737
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c | 918
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 814
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c | 697
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 825
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c | 321
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 437
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h | 138
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loopfilter.c | 743
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c | 54
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c | 731
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c | 30
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h | 48
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c | 256
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c | 802
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c | 1029
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c | 681
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c | 237
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c | 647
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c | 998
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c | 1602
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c | 878
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c | 360
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h | 58
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c | 742
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c | 948
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c | 272
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h | 364
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c | 486
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c | 730
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c | 99
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c | 117
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c | 325
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c | 225
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c | 603
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c | 738
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h | 75
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h | 411
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c | 1230
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c | 1119
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c | 1218
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c | 375
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c | 690
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c | 1489
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c | 147
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c | 333
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c | 326
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h | 734
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h | 435
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h | 355
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c | 588
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c | 732
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c | 756
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h | 177
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h | 1971
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c | 807
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c | 804
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c | 1789
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c | 306
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c | 264
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c | 129
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h | 101
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c | 1357
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c | 622
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c | 716
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c | 611
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c | 684
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c | 692
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c | 716
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c | 1227
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c | 699
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c | 234
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c | 249
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h | 122
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/postproc.h | 25
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h | 47
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c | 374
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c | 553
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c | 119
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c | 767
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c | 1828
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h | 48
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c | 301
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c | 261
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c | 117
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h | 133
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h | 90
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h | 108
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c | 271
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c | 408
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/prob.c | 47
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/prob.h | 106
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/psnr.c | 262
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/psnr.h | 54
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/psnrhvs.c | 281
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/quantize.c | 321
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/quantize.h | 46
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/sad.c | 256
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/skin_detection.c | 79
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/skin_detection.h | 24
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ssim.c | 461
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/ssim.h | 87
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/subtract.c | 54
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/sum_squares.c | 26
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/txfm_common.h | 66
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/variance.c | 566
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/variance.h | 88
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_convolve.c | 537
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_convolve.h | 38
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 471
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h | 77
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 15
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 1823
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 42
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm | 88
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 482
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 577
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 69
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm | 130
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h | 44
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm | 90
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h | 56
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/convolve.h | 279
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h | 161
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h | 88
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h | 112
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm | 432
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2930
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 3130
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c | 399
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h | 1015
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c | 272
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h | 371
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 361
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 1495
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 355
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 349
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 782
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c | 765
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 160
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 47
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 213
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 210
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c | 534
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c | 930
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm | 453
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 404
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h | 112
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c | 1140
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 260
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 155
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 462
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 326
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 522
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 416
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 1021
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm | 315
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 608
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm | 860
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm | 871
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c | 626
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 1235
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 710
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c | 364
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h | 110
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm | 103
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c | 913
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c | 1779
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h | 154
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c | 141
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c | 258
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c | 291
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c | 116
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h | 127
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c | 232
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h | 51
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c | 184
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c | 83
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm | 278
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c | 208
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm | 332
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm | 219
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm | 1467
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c | 203
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm | 127
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c | 105
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h | 367
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h | 32
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c | 872
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c | 565
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 226
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 964
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm | 496
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 1161
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 1458
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 1087
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm | 989
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 803
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm | 450
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm | 420
322 files changed, 155711 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/add_noise.c b/media/libvpx/libvpx/vpx_dsp/add_noise.c
new file mode 100644
index 0000000000..6839e97928
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/add_noise.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
+#include "vpx_ports/mem.h"
+
+void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
+ int whiteclamp, int width, int height, int pitch) {
+ int i, j;
+ int bothclamp = blackclamp + whiteclamp;
+ for (i = 0; i < height; ++i) {
+ uint8_t *pos = start + i * pitch;
+ const int8_t *ref = (const int8_t *)(noise + (rand() & 0xff)); // NOLINT
+
+ for (j = 0; j < width; ++j) {
+ int v = pos[j];
+
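+      // The three clamps below pin v to [blackclamp, 255 - whiteclamp] so
+      // that adding a noise value in [-blackclamp, whiteclamp] stays within
+      // [0, 255]. For example, with blackclamp = whiteclamp = 16, v = 250
+      // becomes 239 and v = 5 becomes 16 before the noise is added.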
+ v = clamp(v - blackclamp, 0, 255);
+ v = clamp(v + bothclamp, 0, 255);
+ v = clamp(v - whiteclamp, 0, 255);
+
+ pos[j] = v + ref[j];
+ }
+ }
+}
+
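+// Normal (Gaussian) probability density function with mean mu and standard
+// deviation sigma:
+//   f(x) = exp(-(x - mu)^2 / (2 * sigma^2)) / (sigma * sqrt(2 * pi))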
+static double gaussian(double sigma, double mu, double x) {
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int vpx_setup_noise(double sigma, int8_t *noise, int size) {
+ int8_t char_dist[256];
+ int next = 0, i, j;
+
+  // Set up a 256-entry lookup table that matches a Gaussian distribution.
+ for (i = -32; i < 32; ++i) {
+ const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ if (a_i) {
+ for (j = 0; j < a_i; ++j) {
+ if (next + j >= 256) goto set_noise;
+ char_dist[next + j] = (int8_t)i;
+ }
+ next = next + j;
+ }
+ }
+
+  // Rounding error - might mean we have fewer than 256.
+ for (; next < 256; ++next) {
+ char_dist[next] = 0;
+ }
+
+set_noise:
+ for (i = 0; i < size; ++i) {
+ noise[i] = char_dist[rand() & 0xff]; // NOLINT
+ }
+
+  // Returns the highest non-zero value used in the distribution.
+ return -char_dist[0];
+}
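+
+// Sketch of how the two functions above pair up (illustrative only; the
+// buffer size and sigma below are arbitrary, not values taken from this
+// file):
+//
+//   int8_t noise[3072];
+//   const int clamp = vpx_setup_noise(4.0, noise, (int)sizeof(noise));
+//   vpx_plane_add_noise(plane, noise, clamp, clamp, width, height, pitch);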
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..8c61fc26f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
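+// Average of a 4x4 block: (sum of the 16 pixels + 8) >> 4, i.e. the rounded
+// mean. The 8x8 version below computes (sum + 32) >> 6 in the same way.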
+uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
+ const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+ const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+ return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4;
+}
+
+uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
+ int i;
+ uint8x8_t b, c;
+ uint16x8_t sum;
+ b = vld1_u8(a);
+ a += a_stride;
+ c = vld1_u8(a);
+ a += a_stride;
+ sum = vaddl_u8(b, c);
+
+ for (i = 0; i < 6; ++i) {
+ const uint8x8_t d = vld1_u8(a);
+ a += a_stride;
+ sum = vaddw_u8(sum, d);
+ }
+
+ return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+int vpx_satd_neon(const tran_low_t *coeff, int length) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int16x8_t abs0, abs1;
+ const int16x8_t s0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8);
+
+ abs0 = vabsq_s16(s0);
+ sum_s32[0] = vpadalq_s16(sum_s32[0], abs0);
+ abs1 = vabsq_s16(s1);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], abs1);
+
+ length -= 16;
+ coeff += 16;
+ } while (length != 0);
+
+ return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
+}
+
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+ const int ref_stride, const int height) {
+ int i;
+ uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_hi = vdupq_n_u16(0);
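+  // vshlq_u16 with a negative shift count shifts right, so the column sums
+  // are divided by 8 when height == 16 and by 16 when height == 32.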
+ const int shift_factor = ((height >> 5) + 3) * -1;
+ const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+ for (i = 0; i < height; i += 8) {
+ const uint8x16_t vec_row1 = vld1q_u8(ref);
+ const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+ const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+ const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+ const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+ const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+ const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+ const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+ vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+ vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+ ref += ref_stride * 8;
+ }
+
+ vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+ vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+ hbuf += 8;
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+ int i;
+ uint16x8_t vec_sum = vdupq_n_u16(0);
+
+ for (i = 0; i < width; i += 16) {
+ const uint8x16_t vec_row = vld1q_u8(ref);
+ vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+ vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+ ref += 16;
+ }
+
+ return (int16_t)horizontal_add_uint16x8(vec_sum);
+}
+
+// ref, src = [0, 510]; their difference fits easily in 16 bits.
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+ int width = 4 << bwl;
+ int32x4_t sse = vdupq_n_s32(0);
+ int16x8_t total = vdupq_n_s16(0);
+
+ assert(width >= 8);
+ assert((width % 8) == 0);
+
+ do {
+ const int16x8_t r = vld1q_s16(ref);
+ const int16x8_t s = vld1q_s16(src);
+ const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits.
+ const int16x4_t diff_lo = vget_low_s16(diff);
+ const int16x4_t diff_hi = vget_high_s16(diff);
+ sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits.
+ sse = vmlal_s16(sse, diff_hi, diff_hi);
+ total = vaddq_s16(total, diff); // dynamic range 16 bits.
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ {
+    // Note: the pairwise addition of 'total' could be implemented as in
+    // horizontal_add_uint16x8(), but using one fewer vpaddl on 'total' and
+    // pairing it with the summation of 'sse' performed better on a
+    // Cortex-A15.
+ const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total'
+ const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+ const int32x2_t t2 = vpadd_s32(t1, t1);
+ const int t = vget_lane_s32(t2, 0);
+ const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'.
+ const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+ vreinterpret_s32_s64(vget_high_s64(s0)));
+ const int s = vget_lane_s32(s1, 0);
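+    // variance = sse - sum * sum / N: here s - ((t * t) >> (bwl + 2)), since
+    // the number of samples N = 4 << bwl == 1 << (bwl + 2).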
+ const int shift_factor = bwl + 2;
+ return s - ((t * t) >> shift_factor);
+ }
+}
+
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
+ // Load and concatenate.
+ const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+ const uint8x16_t a23 =
+ vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+ const uint8x16_t a45 =
+ vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+ const uint8x16_t a67 =
+ vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+ const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+ const uint8x16_t b23 =
+ vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+ const uint8x16_t b45 =
+ vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+ const uint8x16_t b67 =
+ vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
+
+ // Absolute difference.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Max values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+#if VPX_ARCH_AARCH64
+ *min = *max = 0; // Clear high bits
+ *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+ *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..5afdece0ab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ if (width > 8) {
+ int x, y = height;
+ do {
+ for (x = 0; x < width; x += 16) {
+ const uint8x16_t p = vld1q_u8(pred + x);
+ const uint8x16_t r = vld1q_u8(ref + x);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+ vst1q_u8(comp + x, avg);
+ }
+ comp += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--y);
+ } else if (width == 8) {
+ int i = width * height;
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+ const uint8x8_t r_0 = vld1_u8(ref);
+ const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+ r = vcombine_u8(r_0, r_1);
+ ref += 2 * ref_stride;
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ i -= 16;
+ } while (i);
+ } else {
+ int i = width * height;
+ assert(width == 4);
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+
+ r = load_unaligned_u8q(ref, ref_stride);
+ ref += 4 * ref_stride;
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ i -= 16;
+ } while (i);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
new file mode 100644
index 0000000000..7efce32735
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+extern const int16_t vpx_rv[];
+
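+// Rounded average with weights (1, 1, 4, 1, 1) / 8 over (a2, a1, v0, b1, b2),
+// built from rounding halving adds (vrhadd) so the arithmetic stays in 8
+// bits; the cascaded rounding can differ slightly from a single wide sum.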
+static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2) {
+ const uint8x8_t k1 = vrhadd_u8(a2, a1);
+ const uint8x8_t k2 = vrhadd_u8(b2, b1);
+ const uint8x8_t k3 = vrhadd_u8(k1, k2);
+ return vrhadd_u8(k3, v0);
+}
+
+static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2, const uint8x8_t filter) {
+ const uint8x8_t a2_v0 = vabd_u8(a2, v0);
+ const uint8x8_t a1_v0 = vabd_u8(a1, v0);
+ const uint8x8_t b1_v0 = vabd_u8(b1, v0);
+ const uint8x8_t b2_v0 = vabd_u8(b2, v0);
+
+ uint8x8_t max = vmax_u8(a2_v0, a1_v0);
+ max = vmax_u8(b1_v0, max);
+ max = vmax_u8(b2_v0, max);
+ return vclt_u8(max, filter);
+}
+
+static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2, const uint8x8_t filter) {
+ const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2);
+ const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter);
+
+ return vbsl_u8(mask, k_out, v0);
+}
+
+// Same functions but for uint8x16_t.
+static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2) {
+ const uint8x16_t k1 = vrhaddq_u8(a2, a1);
+ const uint8x16_t k2 = vrhaddq_u8(b2, b1);
+ const uint8x16_t k3 = vrhaddq_u8(k1, k2);
+ return vrhaddq_u8(k3, v0);
+}
+
+static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2, const uint8x16_t filter) {
+ const uint8x16_t a2_v0 = vabdq_u8(a2, v0);
+ const uint8x16_t a1_v0 = vabdq_u8(a1, v0);
+ const uint8x16_t b1_v0 = vabdq_u8(b1, v0);
+ const uint8x16_t b2_v0 = vabdq_u8(b2, v0);
+
+ uint8x16_t max = vmaxq_u8(a2_v0, a1_v0);
+ max = vmaxq_u8(b1_v0, max);
+ max = vmaxq_u8(b2_v0, max);
+ return vcltq_u8(max, filter);
+}
+
+static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2,
+ const uint8x16_t filter) {
+ const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2);
+ const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter);
+
+ return vbslq_u8(mask, k_out, v0);
+}
+
+void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int dst_stride, int cols,
+ uint8_t *f, int size) {
+ uint8_t *src, *dst;
+ int row;
+ int col;
+
+  // Process 16 columns at a time while more than 8 columns remain.
+ for (col = 0; col < cols - 8; col += 16) {
+ uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
+ src = src_ptr - 2 * src_stride;
+ dst = dst_ptr;
+
+ a0 = vld1q_u8(src);
+ src += src_stride;
+ a1 = vld1q_u8(src);
+ src += src_stride;
+ a2 = vld1q_u8(src);
+ src += src_stride;
+ a3 = vld1q_u8(src);
+ src += src_stride;
+
+ for (row = 0; row < size; row += 4) {
+ uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;
+ const uint8x16_t filterq = vld1q_u8(f + col);
+
+ a4 = vld1q_u8(src);
+ src += src_stride;
+ a5 = vld1q_u8(src);
+ src += src_stride;
+ a6 = vld1q_u8(src);
+ src += src_stride;
+ a7 = vld1q_u8(src);
+ src += src_stride;
+
+ v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
+ v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
+ v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
+ v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);
+
+ vst1q_u8(dst, v_out_0);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_1);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_2);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_3);
+ dst += dst_stride;
+
+ // Rotate over to the next slot.
+ a0 = a4;
+ a1 = a5;
+ a2 = a6;
+ a3 = a7;
+ }
+
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+
+  // Clean up any leftover 8-wide column.
+ if (col != cols) {
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+ src = src_ptr - 2 * src_stride;
+ dst = dst_ptr;
+
+ a0 = vld1_u8(src);
+ src += src_stride;
+ a1 = vld1_u8(src);
+ src += src_stride;
+ a2 = vld1_u8(src);
+ src += src_stride;
+ a3 = vld1_u8(src);
+ src += src_stride;
+
+ for (row = 0; row < size; row += 4) {
+ uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;
+ const uint8x8_t filter = vld1_u8(f + col);
+
+ a4 = vld1_u8(src);
+ src += src_stride;
+ a5 = vld1_u8(src);
+ src += src_stride;
+ a6 = vld1_u8(src);
+ src += src_stride;
+ a7 = vld1_u8(src);
+ src += src_stride;
+
+ v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
+ v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
+ v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
+ v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);
+
+ vst1_u8(dst, v_out_0);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_1);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_2);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_3);
+ dst += dst_stride;
+
+ // Rotate over to the next slot.
+ a0 = a4;
+ a1 = a5;
+ a2 = a6;
+ a3 = a7;
+ }
+
+ // Not strictly necessary but makes resetting dst_ptr easier.
+ dst_ptr += 8;
+ }
+
+ dst_ptr -= cols;
+
+ for (row = 0; row < size; row += 8) {
+ uint8x8_t a0, a1, a2, a3;
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ src = dst_ptr;
+ dst = dst_ptr;
+
+ // Load 8 values, transpose 4 of them, and discard 2 because they will be
+ // reloaded later.
+ load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
+ a3 = a1;
+ a2 = a1 = a0; // Extend left border.
+
+ src += 2;
+
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
+ v_out_7;
+      // Although the filter is meant to be applied vertically, applying it
+      // horizontally here is OK because the filter values are constant in
+      // blocks of 8 (or 16).
+ const uint8x8_t filter = vld1_u8(f + col);
+
+ load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
+ &b6, &b7);
+
+ if (col + 8 == cols) {
+ // Last row. Extend border (b5).
+ b6 = b7 = b5;
+ }
+
+ v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
+ v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
+ v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
+ v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
+ v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
+ v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
+ v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
+ v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);
+
+ transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
+ v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);
+
+ a0 = b4;
+ a1 = b5;
+ a2 = b6;
+ a3 = b7;
+
+ src += 8;
+ dst += 8;
+ }
+
+ dst_ptr += 8 * dst_stride;
+ }
+}
+
+// sum += x;
+// sumsq += x * y;
+static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
+ int16x4_t *const sum, int32x4_t *const sumsq) {
+ const int16x4_t zero = vdup_n_s16(0);
+ const int32x4_t zeroq = vdupq_n_s32(0);
+
+ // Add in the first set because vext doesn't work with '0'.
+ *sum = vadd_s16(*sum, x);
+ *sumsq = vaddq_s32(*sumsq, xy);
+
+ // Shift x and xy to the right and sum. vext requires an immediate.
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 1));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
+
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 2));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));
+
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 3));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
+}
+
+// Generate mask based on (sumsq * 15 - sum * sum < flimit)
+static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
+ const int32x4_t f, const int32x4_t fifteen) {
+ const int32x4_t a = vmulq_s32(sumsq, fifteen);
+ const int32x4_t b = vmlsl_s16(a, sum, sum);
+ const uint32x4_t mask32 = vcltq_s32(b, f);
+ return vmovn_u32(mask32);
+}
+
+static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
+ const int32x4_t sumsq_low,
+ const int32x4_t sumsq_high, const int32x4_t f) {
+ const int32x4_t fifteen = vdupq_n_s32(15);
+ const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
+ const uint16x4_t mask16_high =
+ calculate_mask(sum_high, sumsq_high, f, fifteen);
+ return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
+}
+
+// Apply filter of (8 + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
+ const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+ const int16x8_t sum_s = vaddq_s16(sum, s16);
+
+ return vqrshrun_n_s16(sum_s, 4);
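+  // vqrshrun_n_s16 adds the rounding constant (1 << 3 == 8) before the
+  // narrowing shift by 4, which supplies the "8 +" of the formula above.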
+}
+
+void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
+ int flimit) {
+ int row, col;
+ const int32x4_t f = vdupq_n_s32(flimit);
+
+ assert(cols % 8 == 0);
+
+ for (row = 0; row < rows; ++row) {
+ // Sum the first 8 elements, which are extended from s[0].
+ // sumsq gets primed with +16.
+ int sumsq = src[0] * src[0] * 9 + 16;
+ int sum = src[0] * 9;
+
+ uint8x8_t left_context, s, right_context;
+ int16x4_t sum_low, sum_high;
+ int32x4_t sumsq_low, sumsq_high;
+
+ // Sum (+square) the next 6 elements.
+ // Skip [0] because it's included above.
+ for (col = 1; col <= 6; ++col) {
+ sumsq += src[col] * src[col];
+ sum += src[col];
+ }
+
+ // Prime the sums. Later the loop uses the _high values to prime the new
+ // vectors.
+ sumsq_high = vdupq_n_s32(sumsq);
+ sum_high = vdup_n_s16(sum);
+
+ // Manually extend the left border.
+ left_context = vdup_n_u8(src[0]);
+
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t mask, output;
+ int16x8_t x, y;
+ int32x4_t xy_low, xy_high;
+
+ s = vld1_u8(src + col);
+
+ if (col + 8 == cols) {
+        // Last 8 columns: extend the right border.
+ right_context = vdup_n_u8(src[col + 7]);
+ } else {
+ right_context = vld1_u8(src + col + 7);
+ }
+
+ x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
+ y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
+ xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+ xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      // Catch up to the last summed value.
+ sum_low = vdup_lane_s16(sum_high, 3);
+ sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
+
+ accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);
+
+      // Need to do this sequentially because the fully accumulated last lane
+      // of sum_low primes sum_high.
+ sum_high = vdup_lane_s16(sum_low, 3);
+ sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
+
+ accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);
+
+ mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
+
+ output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
+ output = vbsl_u8(mask, output, s);
+
+ vst1_u8(src + col, output);
+
+ left_context = s;
+ }
+
+ src += pitch;
+ }
+}
+
+// Apply filter of (vpx_rv + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
+ const int16x8_t rv) {
+ const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+ const int16x8_t sum_s = vaddq_s16(sum, s16);
+ const int16x8_t rounded = vaddq_s16(sum_s, rv);
+
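+  // vqshrun_n_s16 is the non-rounding shift here: the vpx_rv dither value
+  // added above takes the place of the fixed +8 rounding constant.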
+ return vqshrun_n_s16(rounded, 4);
+}
+
+void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int row, col, i;
+ const int32x4_t f = vdupq_n_s32(flimit);
+ uint8x8_t below_context = vdup_n_u8(0);
+
+ // 8 columns are processed at a time.
+ // If rows is less than 8 the bottom border extension fails.
+ assert(cols % 8 == 0);
+ assert(rows >= 8);
+
+ // Load and keep the first 8 values in memory. Process a vertical stripe that
+ // is 8 wide.
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t s, above_context[8];
+ int16x8_t sum, sum_tmp;
+ int32x4_t sumsq_low, sumsq_high;
+
+ // Load and extend the top border.
+ s = vld1_u8(dst);
+ for (i = 0; i < 8; i++) {
+ above_context[i] = s;
+ }
+
+ sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
+
+ // sum * 9
+ sum = vmulq_n_s16(sum_tmp, 9);
+
+ // (sum * 9) * sum == sum * sum * 9
+ sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
+ sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
+
+    // Load the next 6 rows to prime sum and sumsq; the loaded vectors
+    // themselves are not kept.
+ for (i = 1; i <= 6; ++i) {
+ const uint8x8_t a = vld1_u8(dst + i * pitch);
+ const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
+ sum = vaddq_s16(sum, b);
+
+ sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
+ sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
+ }
+
+ for (row = 0; row < rows; ++row) {
+ uint8x8_t mask, output;
+ int16x8_t x, y;
+ int32x4_t xy_low, xy_high;
+
+ s = vld1_u8(dst + row * pitch);
+
+ // Extend the bottom border.
+ if (row + 7 < rows) {
+ below_context = vld1_u8(dst + (row + 7) * pitch);
+ }
+
+ x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
+ y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
+ xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+ xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+ sum = vaddq_s16(sum, x);
+
+ sumsq_low = vaddq_s32(sumsq_low, xy_low);
+ sumsq_high = vaddq_s32(sumsq_high, xy_high);
+
+ mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
+ sumsq_high, f);
+
+ output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
+ output = vbsl_u8(mask, output, s);
+
+ vst1_u8(dst + row * pitch, output);
+
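+      // Slide the 8-row window of above-context up by one row.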
+ above_context[0] = above_context[1];
+ above_context[1] = above_context[2];
+ above_context[2] = above_context[3];
+ above_context[3] = above_context[4];
+ above_context[4] = above_context[5];
+ above_context[5] = above_context[6];
+ above_context[6] = above_context[7];
+ above_context[7] = s;
+ }
+
+ dst += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
new file mode 100644
index 0000000000..fde71ff30d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
+
+// Some builds of gcc 4.9.2 and 4.9.3 have trouble with some of the inline
+// functions.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct16x16_c(input, output, stride);
+}
+
+#else
+
+// Main body of fdct16x16.
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
+ int16x8_t s[8];
+ int16x8_t x[4];
+ int16x8_t step[8];
+
+ // stage 1
+  // From fwd_txfm.c: "Work on the first eight values; fdct8(input,
+  // even_results);"
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], s[5]);
+ x[1] = vsubq_s16(s[4], s[5]);
+ x[2] = vsubq_s16(s[7], s[6]);
+ x[3] = vaddq_s16(s[7], s[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
+
+ // step 2
+  // From fwd_txfm.c: "Work on the next eight values; step1 -> odd_results"
+ // That file distinguished between "in_high" and "step1" but the only
+ // difference is that "in_high" is the first 8 values and "step 1" is the
+ // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+ // step 3
+ s[0] = vaddq_s16(in[8], s[3]);
+ s[1] = vaddq_s16(in[9], s[2]);
+ x[0] = vsubq_s16(in[9], s[2]);
+ x[1] = vsubq_s16(in[8], s[3]);
+ x[2] = vsubq_s16(in[15], s[4]);
+ x[3] = vsubq_s16(in[14], s[5]);
+ s[6] = vaddq_s16(in[14], s[5]);
+ s[7] = vaddq_s16(in[15], s[4]);
+
+ // step 4
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
+
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
+
+ // step 5
+ step[0] = vaddq_s16(s[0], s[1]);
+ step[1] = vsubq_s16(s[0], s[1]);
+ step[2] = vaddq_s16(x[1], s[2]);
+ step[3] = vsubq_s16(x[1], s[2]);
+ step[4] = vsubq_s16(x[2], s[5]);
+ step[5] = vaddq_s16(x[2], s[5]);
+ step[6] = vsubq_s16(s[7], s[6]);
+ step[7] = vaddq_s16(s[7], s[6]);
+
+ // step 6
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
+ &out[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
+ &out[15]);
+
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
+ &out[3]);
+
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
+ &out[11]);
+}
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[16];
+ int16x8_t temp1[16];
+ int16x8_t temp2[16];
+ int16x8_t temp3[16];
+
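+  // Two-pass transform: an 8x16 column transform on each half, a transpose,
+  // a rounding shift between passes (partial_round_shift), then the same
+  // transform on the transposed halves.
+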
+ // Left half.
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp1);
+ vpx_fdct8x16_body(temp1, temp0);
+
+ // Right half.
+ load_cross(input + 8, stride, temp1);
+ scale_input(temp1, temp2);
+ vpx_fdct8x16_body(temp2, temp1);
+
+  // Transpose top left and top right quarters into one contiguous location
+  // and process them as the top half.
+
+ transpose_s16_8x8q(&temp0[0], &temp2[0]);
+ transpose_s16_8x8q(&temp1[0], &temp2[8]);
+ partial_round_shift(temp2);
+ cross_input(temp2, temp3);
+ vpx_fdct8x16_body(temp3, temp2);
+ transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
+ &temp2[5], &temp2[6], &temp2[7]);
+ transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
+ &temp2[13], &temp2[14], &temp2[15]);
+ store(output, temp2);
+ store(output + 8, temp2 + 8);
+ output += 8 * 16;
+
+  // Transpose bottom left and bottom right quarters into one contiguous
+  // location and process them as the bottom half.
+ transpose_s16_8x8q(&temp0[8], &temp1[0]);
+
+ transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+ &temp1[13], &temp1[14], &temp1[15]);
+ partial_round_shift(temp1);
+ cross_input(temp1, temp0);
+ vpx_fdct8x16_body(temp0, temp1);
+ transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
+ &temp1[5], &temp1[6], &temp1[7]);
+ transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+ &temp1[13], &temp1[14], &temp1[15]);
+ store(output, temp1);
+ store(output + 8, temp1 + 8);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ int32x4_t sl[8];
+ int32x4_t sr[8];
+ int32x4_t xl[4];
+ int32x4_t xr[4];
+ int32x4_t inl[8];
+ int32x4_t inr[8];
+ int32x4_t stepl[8];
+ int32x4_t stepr[8];
+
+ // stage 1
+  // From fwd_txfm.c: "Work on the first eight values; fdct8(input,
+  // even_results);"
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // Copy values 8-15 as we're storing in-place
+ inl[0] = left[8];
+ inr[0] = right[8];
+ inl[1] = left[9];
+ inr[1] = right[9];
+ inl[2] = left[10];
+ inr[2] = right[10];
+ inl[3] = left[11];
+ inr[3] = right[11];
+ inl[4] = left[12];
+ inr[4] = right[12];
+ inl[5] = left[13];
+ inr[5] = right[13];
+ inl[6] = left[14];
+ inr[6] = right[14];
+ inl[7] = left[15];
+ inr[7] = right[15];
+
+ // fdct4(step, step);
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[8], &right[8]);
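+  // i.e. butterfly_one_coeff_s32_fast(al, ar, bl, br, c, ...) is the widened
+  // form of add = fdct_round_shift((a + b) * c) and
+  // sub = fdct_round_shift((a - b) * c), applied to the left/right halves.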
+
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[4], &right[4],
+ &left[12], &right[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+ &sr[6], &sl[5], &sr[5]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], sl[5]);
+ xr[0] = vaddq_s32(sr[4], sr[5]);
+ xl[1] = vsubq_s32(sl[4], sl[5]);
+ xr[1] = vsubq_s32(sr[4], sr[5]);
+ xl[2] = vsubq_s32(sl[7], sl[6]);
+ xr[2] = vsubq_s32(sr[7], sr[6]);
+ xl[3] = vaddq_s32(sl[7], sl[6]);
+ xr[3] = vaddq_s32(sr[7], sr[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[2], &right[2],
+ &left[14], &right[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[10], &right[10],
+ &left[6], &right[6]);
+
+ // step 2
+  // From fwd_txfm.c: "Work on the next eight values; step1 -> odd_results"
+  // That file distinguishes between "in_high" and "step1", but the only
+  // difference is that "in_high" holds the first 8 values and "step1" the
+  // second 8. Here, since they are all in one array, the "step1" values live
+  // at index + 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+ &sl[5], &sr[5], &sl[2], &sr[2]);
+ butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+ &sl[4], &sr[4], &sl[3], &sr[3]);
+
+ // step 3
+ sl[0] = vaddq_s32(inl[0], sl[3]);
+ sr[0] = vaddq_s32(inr[0], sr[3]);
+ sl[1] = vaddq_s32(inl[1], sl[2]);
+ sr[1] = vaddq_s32(inr[1], sr[2]);
+ xl[0] = vsubq_s32(inl[1], sl[2]);
+ xr[0] = vsubq_s32(inr[1], sr[2]);
+ xl[1] = vsubq_s32(inl[0], sl[3]);
+ xr[1] = vsubq_s32(inr[0], sr[3]);
+ xl[2] = vsubq_s32(inl[7], sl[4]);
+ xr[2] = vsubq_s32(inr[7], sr[4]);
+ xl[3] = vsubq_s32(inl[6], sl[5]);
+ xr[3] = vsubq_s32(inr[6], sr[5]);
+ sl[6] = vaddq_s32(inl[6], sl[5]);
+ sr[6] = vaddq_s32(inr[6], sr[5]);
+ sl[7] = vaddq_s32(inl[7], sl[4]);
+ sr[7] = vaddq_s32(inr[7], sr[4]);
+
+ // step 4
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * cospi_24_64)
+  // step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] * cospi_8_64)
+ butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+ cospi_24_64, &sl[6], &sr[6], &sl[1],
+ &sr[1]);
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+ cospi_8_64, &sl[2], &sr[2], &sl[5],
+ &sr[5]);
+
+ // step 5
+ stepl[0] = vaddq_s32(sl[0], sl[1]);
+ stepr[0] = vaddq_s32(sr[0], sr[1]);
+ stepl[1] = vsubq_s32(sl[0], sl[1]);
+ stepr[1] = vsubq_s32(sr[0], sr[1]);
+ stepl[2] = vaddq_s32(xl[1], sl[2]);
+ stepr[2] = vaddq_s32(xr[1], sr[2]);
+ stepl[3] = vsubq_s32(xl[1], sl[2]);
+ stepr[3] = vsubq_s32(xr[1], sr[2]);
+ stepl[4] = vsubq_s32(xl[2], sl[5]);
+ stepr[4] = vsubq_s32(xr[2], sr[5]);
+ stepl[5] = vaddq_s32(xl[2], sl[5]);
+ stepr[5] = vaddq_s32(xr[2], sr[5]);
+ stepl[6] = vsubq_s32(sl[7], sl[6]);
+ stepr[6] = vsubq_s32(sr[7], sr[6]);
+ stepl[7] = vaddq_s32(sl[7], sl[6]);
+ stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+ // step 6
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+}
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[16];
+ int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+ right3[16], right4[16];
+
+ // Left half.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+  // Right half.
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+  // Transpose top left and top right quarters into one contiguous location to
+  // process the top half.
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+ highbd_partial_round_shift(left3, right3);
+ highbd_cross_input(left3, right3, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+  // Transpose bottom left and bottom right quarters into one contiguous
+  // location to process the bottom half.
+
+ highbd_partial_round_shift(left4, right4);
+ highbd_cross_input(left4, right4, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+ store16_s32(output, left3);
+ output += 4;
+ store16_s32(output, right3);
+ output += 4;
+
+ store16_s32(output, left4);
+ output += 4;
+ store16_s32(output, right4);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 0000000000..cd58675ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+
+#include <arm_neon.h>
+
+#include "fdct_neon.h"
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+ b[0] = vld1q_s16(a);
+ a += stride;
+ b[1] = vld1q_s16(a);
+ a += stride;
+ b[2] = vld1q_s16(a);
+ a += stride;
+ b[3] = vld1q_s16(a);
+ a += stride;
+ b[4] = vld1q_s16(a);
+ a += stride;
+ b[5] = vld1q_s16(a);
+ a += stride;
+ b[6] = vld1q_s16(a);
+ a += stride;
+ b[7] = vld1q_s16(a);
+ a += stride;
+ b[8] = vld1q_s16(a);
+ a += stride;
+ b[9] = vld1q_s16(a);
+ a += stride;
+ b[10] = vld1q_s16(a);
+ a += stride;
+ b[11] = vld1q_s16(a);
+ a += stride;
+ b[12] = vld1q_s16(a);
+ a += stride;
+ b[13] = vld1q_s16(a);
+ a += stride;
+ b[14] = vld1q_s16(a);
+ a += stride;
+ b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+ store_s16q_to_tran_low(a, b[0]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[1]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[2]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[3]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[4]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[5]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[6]);
+ a += 16;
+ store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. The add/subtract pairs reach across the whole
+// input, so all 16 values must be loaded before crossing. For the first pass
+// this also multiplies the input by 4.
+
+// To reduce register usage this could perhaps be combined with the load()
+// step: load the first 4 and last 4 values, cross those, then load and cross
+// the middle 8.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vshlq_n_s16(a[0], 2);
+ b[1] = vshlq_n_s16(a[1], 2);
+ b[2] = vshlq_n_s16(a[2], 2);
+ b[3] = vshlq_n_s16(a[3], 2);
+ b[4] = vshlq_n_s16(a[4], 2);
+ b[5] = vshlq_n_s16(a[5], 2);
+ b[6] = vshlq_n_s16(a[6], 2);
+ b[7] = vshlq_n_s16(a[7], 2);
+
+ b[8] = vshlq_n_s16(a[8], 2);
+ b[9] = vshlq_n_s16(a[9], 2);
+ b[10] = vshlq_n_s16(a[10], 2);
+ b[11] = vshlq_n_s16(a[11], 2);
+ b[12] = vshlq_n_s16(a[12], 2);
+ b[13] = vshlq_n_s16(a[13], 2);
+ b[14] = vshlq_n_s16(a[14], 2);
+ b[15] = vshlq_n_s16(a[15], 2);
+}
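+
+// Note: vshlq_n_s16(a, 2) above is an element-wise multiply by 4 (the
+// first-pass scaling); e.g. a sample of 255 scales to 1020, well within
+// int16_t.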
+
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+ b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+ b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+}
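+
+// i.e. for r = 0..7: b[r] = row[r] + row[15 - r], and for r = 8..15:
+// b[r] = row[15 - r] - row[r].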
+
+// Quarter round at the beginning of the second pass. Can't use vrshr
+// (rounding shift) because that would add 1 << 1 before shifting; this step
+// must add exactly 1.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+ const int16x8_t one = vdupq_n_s16(1);
+ a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+ a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+ a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+ a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+ a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+ a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+ a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+ a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+ a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+ a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+ a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+ a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+ a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+ a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+ a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+ a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
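+
+// e.g. for a lane value of 6, vrshrq_n_s16(a, 2) would give (6 + 2) >> 2 = 2,
+// while the sequence above gives (6 + 1) >> 2 = 1, matching the (x + 1) >> 2
+// used by the C reference.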
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+                                      int32x4_t *left /*[16]*/,
+                                      int32x4_t *right /*[16]*/) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+ int32x4_t *a_right /*[16]*/,
+ int32x4_t *b_left /*[16]*/,
+ int32x4_t *b_right /*[16]*/) {
+ b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+ b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+ b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+ b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+ b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+ b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+ b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+ b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+ b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+ b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+ b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+ b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+ b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+ b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+ b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+ b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+ b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+                                              int32x4_t *right /*[16]*/) {
+ const int32x4_t one = vdupq_n_s32(1);
+ left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+ left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+ left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+ left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+ left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+ left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+ left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+ left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+ left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+ left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+ left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+ left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+ left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+ left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+ left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+ left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+ right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+ right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+ right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+ right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+ right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+ right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+ right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+ right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+ right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+ right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+ right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+ right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+ right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+ right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+ right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+ right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 32x4 values, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[16]*/) {
+ vst1q_s32(a, b[0]);
+ a += 16;
+ vst1q_s32(a, b[1]);
+ a += 16;
+ vst1q_s32(a, b[2]);
+ a += 16;
+ vst1q_s32(a, b[3]);
+ a += 16;
+ vst1q_s32(a, b[4]);
+ a += 16;
+ vst1q_s32(a, b[5]);
+ a += 16;
+ vst1q_s32(a, b[6]);
+ a += 16;
+ vst1q_s32(a, b[7]);
+ a += 16;
+ vst1q_s32(a, b[8]);
+ a += 16;
+ vst1q_s32(a, b[9]);
+ a += 16;
+ vst1q_s32(a, b[10]);
+ a += 16;
+ vst1q_s32(a, b[11]);
+ a += 16;
+ vst1q_s32(a, b[12]);
+ a += 16;
+ vst1q_s32(a, b[13]);
+ a += 16;
+ vst1q_s32(a, b[14]);
+ a += 16;
+ vst1q_s32(a, b[15]);
+}
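+
+// e.g. store16_s32(out, b) writes b[i] to out + 16 * i: one 4-wide, 16-row
+// column strip of a coefficient block with stride 16.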
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
new file mode 100644
index 0000000000..a91730ce8b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for these functions.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct32x32_c(input, output, stride);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_rd_c(input, output, stride);
+}
+
+#else
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
+
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
+
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
+
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
+
+  // Generate the top 8x32 band by transposing the first set of 8 rows from
+  // each column block into one contiguous buffer.
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
+
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
+
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
+
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
+
+  // Generate the top 8x32 band by transposing the first set of 8 rows from
+  // each column block into one contiguous buffer.
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
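+
+// Note: the _rd variant above shares the first pass with vpx_fdct32x32_neon
+// and differs only in dct_body_second_pass_rd, the lower-precision rounding
+// variant used during rate-distortion search.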
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top 8x32 band by transposing the first set of 8 rows from
+  // each column block into one contiguous buffer.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top 8x32 band by transposing the first set of 8 rows from
+  // each column block into one contiguous buffer.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 0000000000..3b9e64c6df
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8 rows, then the middle 16.
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
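+
+// i.e. for r = 0..15: b[r] = row[r] + row[31 - r], and for r = 16..31:
+// b[r] = row[31 - r] - row[r].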
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
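+
+// e.g. STORE_S16(b, 8, a) expands to store_s16q_to_tran_low(a, b[8]) followed
+// by a += 8, advancing the destination by one 8-value block.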
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
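+
+// With this ordering the first output row of 32 is b[0], b[8], b[16], b[24];
+// the second is b[1], b[9], b[17], b[25]; and so on.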
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+  // Mini cross. Cross the first 16 values and the middle 8 of the second
+  // half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
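+
+// For reference, sub_round_shift_s16() above is the vector form of
+// (a + 1 + (a > 0)) >> 2 from the final-stage comment: divide by 4, rounding
+// to nearest with halves away from zero (e.g. 6 -> 2, -6 -> -2).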
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32_fast( \
+ a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+ a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
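+
+// e.g. ADD_S16_S32(b, 0, 7, c, 0) widens while adding: the low int16x4 halves
+// of b[0] and b[7] are summed into c_lo[0] via vaddl_s16 and the high halves
+// into c_hi[0], keeping the stage-3 sums in 32 bits.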
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+  // Stage 1. For the first pass this was done as part of the load; the second
+  // pass must do it explicitly.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 3. With extreme input values this calculation overflows int16_t. The
+ // sources feeding b[0] are accumulated several times and, through testing,
+ // have been shown to overflow starting here.
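+ // Illustrative note: the ADD_S16_S32 / SUB_S16_S32 helpers used below are
+ // assumed to widen as they combine, along the lines of
+ //   c_lo[i] = vaddl_s16(vget_low_s16(b[l]), vget_low_s16(b[r]));
+ //   c_hi[i] = vaddl_s16(vget_high_s16(b[l]), vget_high_s16(b[r]));
+ // so from this stage on the data is carried as lo/hi int32x4_t pairs.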
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Rounding is rolled into this function so the result can be passed back as
+ // int16x8_t.
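+ // Note: add_round_shift_s32_narrow is assumed to apply the final rounding
+ // shift and narrow each lo/hi int32x4_t pair back to a single int16x8_t. The
+ // out[] indices below follow the bit-reversed output ordering of the scalar
+ // vpx_fdct32 (0, 16, 8, 24, 4, ...).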
+
+ out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+ out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+ out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+ out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+ out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+ out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+ out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+ out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+ out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. For the first pass this is folded into the load; for the second
+ // pass it is done explicitly here.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, all values are rounded down after stage 2 so that
+ // they stay within 16 bits.
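+ // Note: this mirrors the "_rd" C reference, vpx_fdct32x32_rd_c, which trades
+ // a little precision for speed so the whole second pass can stay in int16_t
+ // rather than widening to int32_t.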
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
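+// The high bitdepth (10/12-bit) path below keeps each 8-wide row as a
+// left/right pair of int32x4_t, since the wider input range overflows 16-bit
+// intermediates.
+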
+// Store a 32x32 block of int32 (tran_low_t) coefficients, eight int32x4_t per
+// row, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+ const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+ const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+ const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
+
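+// Widen the 16-bit input to int32 while applying the same "input * 4" scaling
+// (via << 2) that the C reference performs ahead of the first pass.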
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+ int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+ left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+ left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+ left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+ left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+ left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+ left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+ left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+ left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+ left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+ left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+ left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+ left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+ left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+ left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+ left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+ left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+ right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+ right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+ right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+ right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+ right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+ right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+ right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+ right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+ right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+ right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+ right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+ right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+ right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+ right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+ right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+ right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+ int32x4_t *a_right /*[32]*/,
+ int32x4_t *b_left /*[32]*/,
+ int32x4_t *b_right /*[32]*/) {
+ // Stage 1. Done as part of the load for the first pass.
+ b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+ b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+ b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+ b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+ b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+ b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+ b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+ b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+ b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+ b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+ b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+ b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+ b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+ b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+ b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+ b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+ b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+ b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+ b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+ b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+ b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+ b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+ b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+ b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+ b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+ b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+ b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+ b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+ b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+ b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+ b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+ b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+ b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+ b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+ b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+ b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+ b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+ b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+ b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+ b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+ b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+ b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+ b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+ b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+ b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+ b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+ b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+ b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+ b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = add_round_shift_s32(left[0]);
+ left[1] = add_round_shift_s32(left[1]);
+ left[2] = add_round_shift_s32(left[2]);
+ left[3] = add_round_shift_s32(left[3]);
+ left[4] = add_round_shift_s32(left[4]);
+ left[5] = add_round_shift_s32(left[5]);
+ left[6] = add_round_shift_s32(left[6]);
+ left[7] = add_round_shift_s32(left[7]);
+ left[8] = add_round_shift_s32(left[8]);
+ left[9] = add_round_shift_s32(left[9]);
+ left[10] = add_round_shift_s32(left[10]);
+ left[11] = add_round_shift_s32(left[11]);
+ left[12] = add_round_shift_s32(left[12]);
+ left[13] = add_round_shift_s32(left[13]);
+ left[14] = add_round_shift_s32(left[14]);
+ left[15] = add_round_shift_s32(left[15]);
+ left[16] = add_round_shift_s32(left[16]);
+ left[17] = add_round_shift_s32(left[17]);
+ left[18] = add_round_shift_s32(left[18]);
+ left[19] = add_round_shift_s32(left[19]);
+ left[20] = add_round_shift_s32(left[20]);
+ left[21] = add_round_shift_s32(left[21]);
+ left[22] = add_round_shift_s32(left[22]);
+ left[23] = add_round_shift_s32(left[23]);
+ left[24] = add_round_shift_s32(left[24]);
+ left[25] = add_round_shift_s32(left[25]);
+ left[26] = add_round_shift_s32(left[26]);
+ left[27] = add_round_shift_s32(left[27]);
+ left[28] = add_round_shift_s32(left[28]);
+ left[29] = add_round_shift_s32(left[29]);
+ left[30] = add_round_shift_s32(left[30]);
+ left[31] = add_round_shift_s32(left[31]);
+
+ right[0] = add_round_shift_s32(right[0]);
+ right[1] = add_round_shift_s32(right[1]);
+ right[2] = add_round_shift_s32(right[2]);
+ right[3] = add_round_shift_s32(right[3]);
+ right[4] = add_round_shift_s32(right[4]);
+ right[5] = add_round_shift_s32(right[5]);
+ right[6] = add_round_shift_s32(right[6]);
+ right[7] = add_round_shift_s32(right[7]);
+ right[8] = add_round_shift_s32(right[8]);
+ right[9] = add_round_shift_s32(right[9]);
+ right[10] = add_round_shift_s32(right[10]);
+ right[11] = add_round_shift_s32(right[11]);
+ right[12] = add_round_shift_s32(right[12]);
+ right[13] = add_round_shift_s32(right[13]);
+ right[14] = add_round_shift_s32(right[14]);
+ right[15] = add_round_shift_s32(right[15]);
+ right[16] = add_round_shift_s32(right[16]);
+ right[17] = add_round_shift_s32(right[17]);
+ right[18] = add_round_shift_s32(right[18]);
+ right[19] = add_round_shift_s32(right[19]);
+ right[20] = add_round_shift_s32(right[20]);
+ right[21] = add_round_shift_s32(right[21]);
+ right[22] = add_round_shift_s32(right[22]);
+ right[23] = add_round_shift_s32(right[23]);
+ right[24] = add_round_shift_s32(right[24]);
+ right[25] = add_round_shift_s32(right[25]);
+ right[26] = add_round_shift_s32(right[26]);
+ right[27] = add_round_shift_s32(right[27]);
+ right[28] = add_round_shift_s32(right[28]);
+ right[29] = add_round_shift_s32(right[29]);
+ right[30] = add_round_shift_s32(right[30]);
+ right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = sub_round_shift_s32(left[0]);
+ left[1] = sub_round_shift_s32(left[1]);
+ left[2] = sub_round_shift_s32(left[2]);
+ left[3] = sub_round_shift_s32(left[3]);
+ left[4] = sub_round_shift_s32(left[4]);
+ left[5] = sub_round_shift_s32(left[5]);
+ left[6] = sub_round_shift_s32(left[6]);
+ left[7] = sub_round_shift_s32(left[7]);
+ left[8] = sub_round_shift_s32(left[8]);
+ left[9] = sub_round_shift_s32(left[9]);
+ left[10] = sub_round_shift_s32(left[10]);
+ left[11] = sub_round_shift_s32(left[11]);
+ left[12] = sub_round_shift_s32(left[12]);
+ left[13] = sub_round_shift_s32(left[13]);
+ left[14] = sub_round_shift_s32(left[14]);
+ left[15] = sub_round_shift_s32(left[15]);
+ left[16] = sub_round_shift_s32(left[16]);
+ left[17] = sub_round_shift_s32(left[17]);
+ left[18] = sub_round_shift_s32(left[18]);
+ left[19] = sub_round_shift_s32(left[19]);
+ left[20] = sub_round_shift_s32(left[20]);
+ left[21] = sub_round_shift_s32(left[21]);
+ left[22] = sub_round_shift_s32(left[22]);
+ left[23] = sub_round_shift_s32(left[23]);
+ left[24] = sub_round_shift_s32(left[24]);
+ left[25] = sub_round_shift_s32(left[25]);
+ left[26] = sub_round_shift_s32(left[26]);
+ left[27] = sub_round_shift_s32(left[27]);
+ left[28] = sub_round_shift_s32(left[28]);
+ left[29] = sub_round_shift_s32(left[29]);
+ left[30] = sub_round_shift_s32(left[30]);
+ left[31] = sub_round_shift_s32(left[31]);
+
+ right[0] = sub_round_shift_s32(right[0]);
+ right[1] = sub_round_shift_s32(right[1]);
+ right[2] = sub_round_shift_s32(right[2]);
+ right[3] = sub_round_shift_s32(right[3]);
+ right[4] = sub_round_shift_s32(right[4]);
+ right[5] = sub_round_shift_s32(right[5]);
+ right[6] = sub_round_shift_s32(right[6]);
+ right[7] = sub_round_shift_s32(right[7]);
+ right[8] = sub_round_shift_s32(right[8]);
+ right[9] = sub_round_shift_s32(right[9]);
+ right[10] = sub_round_shift_s32(right[10]);
+ right[11] = sub_round_shift_s32(right[11]);
+ right[12] = sub_round_shift_s32(right[12]);
+ right[13] = sub_round_shift_s32(right[13]);
+ right[14] = sub_round_shift_s32(right[14]);
+ right[15] = sub_round_shift_s32(right[15]);
+ right[16] = sub_round_shift_s32(right[16]);
+ right[17] = sub_round_shift_s32(right[17]);
+ right[18] = sub_round_shift_s32(right[18]);
+ right[19] = sub_round_shift_s32(right[19]);
+ right[20] = sub_round_shift_s32(right[20]);
+ right[21] = sub_round_shift_s32(right[21]);
+ right[22] = sub_round_shift_s32(right[22]);
+ right[23] = sub_round_shift_s32(right[23]);
+ right[24] = sub_round_shift_s32(right[24]);
+ right[25] = sub_round_shift_s32(right[25]);
+ right[26] = sub_round_shift_s32(right[26]);
+ right[27] = sub_round_shift_s32(right[27]);
+ right[28] = sub_round_shift_s32(right[28]);
+ right[29] = sub_round_shift_s32(right[29]);
+ right[30] = sub_round_shift_s32(right[30]);
+ right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: add/subtract pairs within the first 16 values and butterfly the
+ // middle 8 of the second half.
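+ // Concretely: values 0..15 are folded into sum/difference pairs, values
+ // 20..27 go through the cospi_16_64 butterfly, and values 16..19 and 28..31
+ // pass straight through to the next stage.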
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: add/subtract pairs within the first 16 values and butterfly the
+ // middle 8 of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
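+  // bl[0..15] already hold the even-indexed outputs; the butterflies below
+  // produce the odd-indexed ones.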
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+  // For the "rd" version, all the values are rounded and shifted right by 2
+  // after stage 2 to keep them within a 16-bit dynamic range.
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
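+  // bl[0..15] already hold the even-indexed outputs; the butterflies below
+  // produce the odd-indexed ones.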
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
new file mode 100644
index 0000000000..3b9196fae9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int16x4_t in[4];
+ in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
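+    // Reinterpreting the 64-bit value 1 as four 16-bit lanes yields
+    // { 1, 0, 0, 0 } on little-endian targets, so only the DC input is
+    // incremented.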
+ in[0] = vadd_s16(in[0], one);
+ }
+ vpx_fdct4x4_pass1_neon(in);
+ vpx_fdct4x4_pass2_neon(in);
+ {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2; a true
+    // rounding shift by 2 would add 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+ int16x8_t out_23 = vcombine_s16(in[2], in[3]);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ static const int32x4_t const_1000 = { 1, 0, 0, 0 };
+ const int32x4_t const_one = vdupq_n_s32(1);
+
+ // input[M * stride] * 16
+ int32x4_t in[4];
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ in[0] = vaddq_s32(in[0], const_1000);
+ }
+
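+  // Both passes of the high bitdepth 4x4 FDCT are identical (only pass1 is
+  // defined in fdct4x4_neon.h), so the same function is applied twice.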
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2; a true
+    // rounding shift by 2 would add 2.
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+ in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+ in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+ in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+ vst1q_s32(final_output, in[0]);
+ vst1q_s32(final_output + 4, in[1]);
+ vst1q_s32(final_output + 8, in[2]);
+ vst1q_s32(final_output + 12, in[3]);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 0000000000..de3db9774c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
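+// Pass 1 of the 4x4 FDCT: one stage of butterflies over the rows followed by
+// a transpose. Pass 2 below is identical except that it uses the more
+// accurate widening (s32) butterfly for the one-coefficient step.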
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift((s_0 +/- s_1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift((s_0 +/- s_1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
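+// With 32-bit lanes there is enough precision for the whole transform, so the
+// same routine serves as both pass 1 and pass 2 (see vpx_highbd_fdct4x4_neon).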
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 0000000000..75ee6f2230
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // stage 1
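+  // input[M * stride] * 4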
+ int16x8_t in[8];
+ in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+ vpx_fdct8x8_pass1_neon(in);
+ vpx_fdct8x8_pass2_neon(in);
+ {
+ // from vpx_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
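+    // (n >> 15) is -1 for negative n and 0 otherwise, so subtracting it
+    // rounds the halving towards zero instead of towards minus infinity.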
+ const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+ in[0] = vhsubq_s16(in[0], sign_in0);
+ in[1] = vhsubq_s16(in[1], sign_in1);
+ in[2] = vhsubq_s16(in[2], sign_in2);
+ in[3] = vhsubq_s16(in[3], sign_in3);
+ in[4] = vhsubq_s16(in[4], sign_in4);
+ in[5] = vhsubq_s16(in[5], sign_in5);
+ in[6] = vhsubq_s16(in[6], sign_in6);
+ in[7] = vhsubq_s16(in[7], sign_in7);
+ // store results
+ store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+ store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+ store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+ store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+ store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+ store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+ store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+ store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+  // input[M * stride] * 4
+ int32x4_t left[8], right[8];
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+
+ left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ vpx_highbd_fdct8x8_pass2_neon(left, right);
+ {
+ left[0] = add_round_shift_half_s32(left[0]);
+ left[1] = add_round_shift_half_s32(left[1]);
+ left[2] = add_round_shift_half_s32(left[2]);
+ left[3] = add_round_shift_half_s32(left[3]);
+ left[4] = add_round_shift_half_s32(left[4]);
+ left[5] = add_round_shift_half_s32(left[5]);
+ left[6] = add_round_shift_half_s32(left[6]);
+ left[7] = add_round_shift_half_s32(left[7]);
+ right[0] = add_round_shift_half_s32(right[0]);
+ right[1] = add_round_shift_half_s32(right[1]);
+ right[2] = add_round_shift_half_s32(right[2]);
+ right[3] = add_round_shift_half_s32(right[3]);
+ right[4] = add_round_shift_half_s32(right[4]);
+ right[5] = add_round_shift_half_s32(right[5]);
+ right[6] = add_round_shift_half_s32(right[6]);
+ right[7] = add_round_shift_half_s32(right[7]);
+
+ // store results
+ vst1q_s32(final_output, left[0]);
+ vst1q_s32(final_output + 4, right[0]);
+ vst1q_s32(final_output + 8, left[1]);
+ vst1q_s32(final_output + 12, right[1]);
+ vst1q_s32(final_output + 16, left[2]);
+ vst1q_s32(final_output + 20, right[2]);
+ vst1q_s32(final_output + 24, left[3]);
+ vst1q_s32(final_output + 28, right[3]);
+ vst1q_s32(final_output + 32, left[4]);
+ vst1q_s32(final_output + 36, right[4]);
+ vst1q_s32(final_output + 40, left[5]);
+ vst1q_s32(final_output + 44, right[5]);
+ vst1q_s32(final_output + 48, left[6]);
+ vst1q_s32(final_output + 52, right[6]);
+ vst1q_s32(final_output + 56, left[7]);
+ vst1q_s32(final_output + 60, right[7]);
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 0000000000..cc65157430
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
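+// Pass 1 uses the fast 16-bit one-coefficient butterfly; pass 2 uses the
+// widening 32-bit variant, which keeps full precision before narrowing.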
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+ transpose_s32_8x8_2(left, right, left, right);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+ transpose_s32_8x8_2(left, right, left, right);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 0000000000..16f5c5fc0e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
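+// vqrdmulh returns the doubled high half of the product, i.e.
+// (2 * x * c + (1 << 15)) >> 16; with c == 2 * constant this equals
+// (x * constant + (1 << 13)) >> 14, which is fdct_round_shift() with
+// DCT_CONST_BITS == 14.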
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
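+// The coefficient is pre-scaled by 1 << 17 so that vqrdmulhq_s32, which
+// returns the doubled high half of the product, yields
+// (x * constant + (1 << 13)) >> 14, i.e. fdct_round_shift().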
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs the normal implementation on full vector
+// fully accurate, does 32-bit processing, takes 16-bit values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs the normal implementation on full vector
+// fully accurate, does 32-bit processing, takes 16-bit values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// (a +/- b) * c, without the final rounding shift
+// Variant that performs the plain multiply-accumulate implementation on full
+// vector, does 32-bit processing, takes and returns 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // a * c is needed for both the sum and the difference, so compute it once.
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+  *sub_lo = vmlsq_n_s32(a1, b_lo, constant);
+  *sub_hi = vmlsq_n_s32(a2, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_one_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac holds the following values:
+ // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c,
+ // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c
+ int64x2_t ac[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant);
+ ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant);
+ ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant);
+ ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant);
+
+ sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2, without the final rounding shift
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes 32-bit values,
+// returns 64-bit results without rounding
+static INLINE void butterfly_two_coeff_s32_s64_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/,
+ int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/,
+ int64x2_t *sub_hi /*[2]*/) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+
+ sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// More accurate full-vector variant: does 64-bit processing, takes and
+// returns 32-bit values, and returns narrowed results.
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2 (the rounding shift is left to the caller)
+// Original full-vector variant: does 32-bit processing, takes 16-bit values,
+// and returns 32-bit results without rounding.
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+ const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+ const int16x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+ const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+ const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+ const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+ *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+ *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+ *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+ *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// a * c1 +/- b * c2 (the rounding shift is left to the caller)
+// Original full-vector variant: does 32-bit processing, takes and returns
+// 32-bit values without rounding.
+static INLINE void butterfly_two_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Half-vector variant: does 32-bit processing, takes and returns 16-bit
+// values, and returns narrowed results.
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original full-vector variant: does 32-bit processing, takes and returns
+// 16-bit values, and returns narrowed results.
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original full-vector variant: does 32-bit processing, takes and returns
+// 32-bit values, and returns rounded (not narrowed) results.
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
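+
+// A scalar sketch (hypothetical) of the rounding trick above: e.g. a = 6
+// gives (6 + 0 + 1) >> 2 = 1 and a = -6 gives (-6 + 1 + 1) >> 2 = -1, so
+// ties round toward zero.
+static INLINE int16_t add_round_shift_s16_scalar_model(int16_t a) {
+  const int16_t sign = (a < 0) ? 1 : 0;  // mirrors the unsigned shift of bit 15
+  return (int16_t)((a + sign + 1) >> 2);
+}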
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding,
+// returning narrowed results.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
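+
+// A scalar sketch (hypothetical) of the two sub_round_shift helpers above:
+// the rounding shift itself contributes +2, so a = 6 gives
+// (6 - 0 + 2) >> 2 = 2 and a = -6 gives (-6 - 1 + 2) >> 2 = -2.
+static INLINE int32_t sub_round_shift_scalar_model(int32_t a) {
+  const int32_t sign = (a < 0) ? 1 : 0;
+  return (a - sign + 2) >> 2;  // vrshrq_n_s32(x, 2) computes (x + 2) >> 2
+}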
+
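+// Add (below) or subtract (next function) the 64-bit pairs, then round-shift
+// by DCT_CONST_BITS and narrow to 32 bits.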
+static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/,
+ const int64x2_t *b /*[2]*/) {
+ int64x2_t result[2];
+ result[0] = vaddq_s64(a[0], b[0]);
+ result[1] = vaddq_s64(a[1], b[1]);
+ return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+ vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/,
+ const int64x2_t *b /*[2]*/) {
+ int64x2_t result[2];
+ result[0] = vsubq_s64(a[0], b[0]);
+ result[1] = vsubq_s64(a[1], b[1]);
+ return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+ vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
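+// Widen to 64 bits so the add (below) or subtract (next function) cannot
+// overflow 32 bits, then truncate back to 32 bits with vmovn.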
+static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a,
+ const int32x4_t b) {
+ int64x2_t a64[2], b64[2], result[2];
+ a64[0] = vmovl_s32(vget_low_s32(a));
+ a64[1] = vmovl_s32(vget_high_s32(a));
+ b64[0] = vmovl_s32(vget_low_s32(b));
+ b64[1] = vmovl_s32(vget_high_s32(b));
+ result[0] = vaddq_s64(a64[0], b64[0]);
+ result[1] = vaddq_s64(a64[1], b64[1]);
+ return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a,
+ const int32x4_t b) {
+ int64x2_t a64[2], b64[2], result[2];
+ a64[0] = vmovl_s32(vget_low_s32(a));
+ a64[1] = vmovl_s32(vget_high_s32(a));
+ b64[0] = vmovl_s32(vget_low_s32(b));
+ b64[1] = vmovl_s32(vget_high_s32(b));
+ result[0] = vsubq_s64(a64[0], b64[0]);
+ result[1] = vsubq_s64(a64[1], b64[1]);
+ return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 0000000000..718dba0d91
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x4_t a0, a1, a2, a3;
+ int16x8_t b0, b1;
+ int16x8_t c;
+
+ a0 = vld1_s16(input);
+ input += stride;
+ a1 = vld1_s16(input);
+ input += stride;
+ a2 = vld1_s16(input);
+ input += stride;
+ a3 = vld1_s16(input);
+
+ b0 = vcombine_s16(a0, a1);
+ b1 = vcombine_s16(a2, a3);
+
+ c = vaddq_s16(b0, b1);
+
+ output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1);
+ output[1] = 0;
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+
+ output[0] = (tran_low_t)horizontal_add_int16x8(sum);
+ output[1] = 0;
+}
+
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t left = vld1q_s16(input);
+ int16x8_t right = vld1q_s16(input + 8);
+ int32_t sum;
+ input += stride;
+
+ for (r = 1; r < 16; ++r) {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ left = vaddq_s16(left, a);
+ right = vaddq_s16(right, b);
+ }
+
+ sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right);
+
+ output[0] = (tran_low_t)(sum >> 1);
+ output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t a0 = vld1q_s16(input);
+ int16x8_t a1 = vld1q_s16(input + 8);
+ int16x8_t a2 = vld1q_s16(input + 16);
+ int16x8_t a3 = vld1q_s16(input + 24);
+ int32_t sum;
+ input += stride;
+
+ for (r = 1; r < 32; ++r) {
+ const int16x8_t b0 = vld1q_s16(input);
+ const int16x8_t b1 = vld1q_s16(input + 8);
+ const int16x8_t b2 = vld1q_s16(input + 16);
+ const int16x8_t b3 = vld1q_s16(input + 24);
+ input += stride;
+ a0 = vaddq_s16(a0, b0);
+ a1 = vaddq_s16(a1, b1);
+ a2 = vaddq_s16(a2, b2);
+ a3 = vaddq_s16(a3, b3);
+ }
+
+ sum = horizontal_add_int16x8(a0);
+ sum += horizontal_add_int16x8(a1);
+ sum += horizontal_add_int16x8(a2);
+ sum += horizontal_add_int16x8(a3);
+ output[0] = (tran_low_t)(sum >> 3);
+ output[1] = 0;
+}
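+
+// Illustrative scalar model (hypothetical, not part of the upstream patch) of
+// the four DC-only partial forward transforms above: sum the block, then
+// apply the per-size scaling the full transform would give the DC
+// coefficient.
+static int32_t partial_fdct_dc_model(const int16_t *input, int stride,
+                                     int size) {
+  int32_t sum = 0;
+  int r, c;
+  for (r = 0; r < size; ++r) {
+    for (c = 0; c < size; ++c) sum += input[r * stride + c];
+  }
+  if (size == 4) return sum << 1;   // vpx_fdct4x4_1_neon
+  if (size == 8) return sum;        // vpx_fdct8x8_1_neon
+  if (size == 16) return sum >> 1;  // vpx_fdct16x16_1_neon
+  return sum >> 3;                  // vpx_fdct32x32_1_neon
+}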
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+ r++;
+ } while (r < 16);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 1);
+ output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a0 = vld1q_s16(input);
+ const int16x8_t a1 = vld1q_s16(input + 8);
+ const int16x8_t a2 = vld1q_s16(input + 16);
+ const int16x8_t a3 = vld1q_s16(input + 24);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+ r++;
+ } while (r < 32);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 3);
+ output[1] = 0;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000000..f6b6d7e3ce
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ const int16x8_t b0 = vaddq_s16(*a0, *a1);
+ const int16x8_t b1 = vsubq_s16(*a0, *a1);
+ const int16x8_t b2 = vaddq_s16(*a2, *a3);
+ const int16x8_t b3 = vsubq_s16(*a2, *a3);
+ const int16x8_t b4 = vaddq_s16(*a4, *a5);
+ const int16x8_t b5 = vsubq_s16(*a4, *a5);
+ const int16x8_t b6 = vaddq_s16(*a6, *a7);
+ const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c4 = vaddq_s16(b4, b6);
+ const int16x8_t c5 = vaddq_s16(b5, b7);
+ const int16x8_t c6 = vsubq_s16(b4, b6);
+ const int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a1 = vsubq_s16(c2, c6);
+ *a2 = vsubq_s16(c0, c4);
+ *a3 = vaddq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+ *a6 = vsubq_s16(c1, c5);
+ *a7 = vaddq_s16(c1, c5);
+}
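+
+// A scalar sketch (hypothetical) of the 8-point butterfly network above; the
+// b/c stages mirror the vaddq/vsubq pairs, and the output ordering matches
+// the final assignments in hadamard8x8_one_pass.
+static void hadamard8_scalar_model(const int16_t *in, int16_t *out) {
+  int16_t b[8], c[8];
+  int i;
+  for (i = 0; i < 8; i += 2) {
+    b[i + 0] = in[i + 0] + in[i + 1];
+    b[i + 1] = in[i + 0] - in[i + 1];
+  }
+  c[0] = b[0] + b[2]; c[1] = b[1] + b[3];
+  c[2] = b[0] - b[2]; c[3] = b[1] - b[3];
+  c[4] = b[4] + b[6]; c[5] = b[5] + b[7];
+  c[6] = b[4] - b[6]; c[7] = b[5] - b[7];
+  out[0] = c[0] + c[4]; out[1] = c[2] - c[6];
+  out[2] = c[0] - c[4]; out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7]; out[5] = c[3] - c[7];
+  out[6] = c[1] - c[5]; out[7] = c[1] + c[5];
+}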
+
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ store_s16q_to_tran_low(coeff + 0, a0);
+ store_s16q_to_tran_low(coeff + 8, a1);
+ store_s16q_to_tran_low(coeff + 16, a2);
+ store_s16q_to_tran_low(coeff + 24, a3);
+ store_s16q_to_tran_low(coeff + 32, a4);
+ store_s16q_to_tran_low(coeff + 40, a5);
+ store_s16q_to_tran_low(coeff + 48, a6);
+ store_s16q_to_tran_low(coeff + 56, a7);
+}
+
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+ for (i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 64, c1);
+ store_s16q_to_tran_low(coeff + 128, c2);
+ store_s16q_to_tran_low(coeff + 192, c3);
+
+ coeff += 8;
+ }
+}
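+
+// Per output lane, the combine loop above computes the following (scalar
+// sketch, hypothetical); the halving adds/subtracts keep intermediates within
+// int16_t range:
+static INLINE void hadamard_combine_model(int16_t a0, int16_t a1, int16_t a2,
+                                          int16_t a3, int16_t *out) {
+  const int16_t b0 = (int16_t)((a0 + a1) >> 1);  // vhaddq_s16
+  const int16_t b1 = (int16_t)((a0 - a1) >> 1);  // vhsubq_s16
+  const int16_t b2 = (int16_t)((a2 + a3) >> 1);
+  const int16_t b3 = (int16_t)((a2 - a3) >> 1);
+  out[0] = b0 + b2;
+  out[1] = b1 + b3;
+  out[2] = b0 - b2;
+  out[3] = b1 - b3;
+}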
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+
+ /* Rearrange 32x32 to 16x64 and remove stride.
+ * Top left first. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vhaddq_s16(b0, b2);
+ const int16x8_t c1 = vhaddq_s16(b1, b3);
+ const int16x8_t c2 = vhsubq_s16(b0, b2);
+ const int16x8_t c3 = vhsubq_s16(b1, b3);
+
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 256, c1);
+ store_s16q_to_tran_low(coeff + 512, c2);
+ store_s16q_to_tran_low(coeff + 768, c3);
+
+ coeff += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..4265596c8c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p);
+ const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p);
+ return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;
+}
+
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+ load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ sum = vaddq_u16(a0, a1);
+ sum = vaddq_u16(sum, a2);
+ sum = vaddq_u16(sum, a3);
+ sum = vaddq_u16(sum, a4);
+ sum = vaddq_u16(sum, a5);
+ sum = vaddq_u16(sum, a6);
+ sum = vaddq_u16(sum, a7);
+
+ return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
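+
+// Both averages above follow one scalar pattern (sketch, hypothetical): sum
+// the n x n block, then divide by n * n with rounding, i.e. (sum + 8) >> 4
+// for 4x4 and (sum + 32) >> 6 for 8x8.
+static INLINE uint32_t highbd_avg_model(const uint16_t *s, int p, int n) {
+  uint32_t sum = 0;
+  int r, c;
+  for (r = 0; r < n; ++r) {
+    for (c = 0; c < n; ++c) sum += s[r * p + c];
+  }
+  return (sum + (uint32_t)(n * n) / 2) / (uint32_t)(n * n);
+}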
+
+// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
+// length: value range {16, 64, 256, 1024}.
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024]
+int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) {
+ int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int32x4_t abs0, abs1;
+ const int32x4_t s0 = load_tran_low_to_s32q(coeff);
+ const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4);
+
+ abs0 = vabsq_s32(s0);
+ sum_s64[0] = vpadalq_s32(sum_s64[0], abs0);
+ abs1 = vabsq_s32(s1);
+ sum_s64[1] = vpadalq_s32(sum_s64[1], abs1);
+
+ length -= 8;
+ coeff += 8;
+ } while (length != 0);
+
+ return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1]));
+}
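+
+// Scalar equivalent (sketch, hypothetical) of the SATD above: a plain sum of
+// absolute coefficient values, accumulated in 64 bits to cover the 42-bit
+// dynamic range noted in the comment before truncating to int.
+static INLINE int highbd_satd_model(const tran_low_t *coeff, int length) {
+  int64_t satd = 0;
+  int i;
+  for (i = 0; i < length; ++i) {
+    satd += (coeff[i] < 0) ? -(int64_t)coeff[i] : (int64_t)coeff[i];
+  }
+  return (int)satd;
+}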
+
+void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+ const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+ const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+ const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+ const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+ const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+ const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+ const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+ const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+ const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+ const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+ const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+ const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+ const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+ const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+ const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+ const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+ const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+ const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+ const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+ const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+ const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+ const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+ const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+ const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+ const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+ const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+ const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+ const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t min0123 = vminq_u16(min01, min23);
+ const uint16x8_t min4567 = vminq_u16(min45, min67);
+ const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if VPX_ARCH_AARCH64
+ *min = *max = 0; // Clear high bits
+ *((uint16_t *)max) = vmaxvq_u16(max07);
+ *((uint16_t *)min) = vminvq_u16(min07);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+ uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u16((uint16_t *)max, ab_max, 0);
+ vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
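+
+// Scalar sketch (hypothetical) of the reduction above: the min and max of the
+// 64 absolute differences between the two 8x8 blocks.
+static INLINE void highbd_minmax_model(const uint16_t *a, int p,
+                                       const uint16_t *b, int dp, int *min,
+                                       int *max) {
+  int r, c;
+  *min = 65535;
+  *max = 0;
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < 8; ++c) {
+      const int d = (int)a[r * p + c] - (int)b[r * dp + c];
+      const int abs_d = (d < 0) ? -d : d;
+      if (abs_d < *min) *min = abs_d;
+      if (abs_d > *max) *max = abs_d;
+    }
+  }
+}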
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..3063acbb3e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
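+
+// Per pixel, the rounding halving adds above compute (scalar sketch,
+// hypothetical):
+static INLINE uint16_t highbd_comp_avg_model(uint16_t pred, uint16_t ref) {
+  return (uint16_t)((pred + ref + 1) >> 1);  // vrhadd(q)_u16
+}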
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..499eb65462
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ int16x8_t b0 = vaddq_s16(*a0, *a1);
+ int16x8_t b1 = vsubq_s16(*a0, *a1);
+ int16x8_t b2 = vaddq_s16(*a2, *a3);
+ int16x8_t b3 = vsubq_s16(*a2, *a3);
+ int16x8_t b4 = vaddq_s16(*a4, *a5);
+ int16x8_t b5 = vsubq_s16(*a4, *a5);
+ int16x8_t b6 = vaddq_s16(*a6, *a7);
+ int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ int16x8_t c0 = vaddq_s16(b0, b2);
+ int16x8_t c2 = vsubq_s16(b0, b2);
+ int16x8_t c1 = vaddq_s16(b1, b3);
+ int16x8_t c3 = vsubq_s16(b1, b3);
+ int16x8_t c4 = vaddq_s16(b4, b6);
+ int16x8_t c6 = vsubq_s16(b4, b6);
+ int16x8_t c5 = vaddq_s16(b5, b7);
+ int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a2 = vsubq_s16(c0, c4);
+ *a7 = vaddq_s16(c1, c5);
+ *a6 = vsubq_s16(c1, c5);
+ *a3 = vaddq_s16(c2, c6);
+ *a1 = vsubq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+ int16x4_t a2, int16x4_t a3,
+ int16x4_t a4, int16x4_t a5,
+ int16x4_t a6, int16x4_t a7,
+ tran_low_t *coeff) {
+ int32x4_t b0 = vaddl_s16(a0, a1);
+ int32x4_t b1 = vsubl_s16(a0, a1);
+ int32x4_t b2 = vaddl_s16(a2, a3);
+ int32x4_t b3 = vsubl_s16(a2, a3);
+ int32x4_t b4 = vaddl_s16(a4, a5);
+ int32x4_t b5 = vsubl_s16(a4, a5);
+ int32x4_t b6 = vaddl_s16(a6, a7);
+ int32x4_t b7 = vsubl_s16(a6, a7);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+ int32x4_t c4 = vaddq_s32(b4, b6);
+ int32x4_t c6 = vsubq_s32(b4, b6);
+ int32x4_t c5 = vaddq_s32(b5, b7);
+ int32x4_t c7 = vsubq_s32(b5, b7);
+
+ int32x4_t d0 = vaddq_s32(c0, c4);
+ int32x4_t d2 = vsubq_s32(c0, c4);
+ int32x4_t d7 = vaddq_s32(c1, c5);
+ int32x4_t d6 = vsubq_s32(c1, c5);
+ int32x4_t d3 = vaddq_s32(c2, c6);
+ int32x4_t d1 = vsubq_s32(c2, c6);
+ int32x4_t d4 = vaddq_s32(c3, c7);
+ int32x4_t d5 = vsubq_s32(c3, c7);
+
+ store_s32q_to_tran_low(coeff + 0, d0);
+ store_s32q_to_tran_low(coeff + 4, d1);
+ store_s32q_to_tran_low(coeff + 8, d2);
+ store_s32q_to_tran_low(coeff + 12, d3);
+ store_s32q_to_tran_low(coeff + 16, d4);
+ store_s32q_to_tran_low(coeff + 20, d5);
+ store_s32q_to_tran_low(coeff + 24, d6);
+ store_s32q_to_tran_low(coeff + 28, d7);
+}
+
+void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
+ int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
+ int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
+ hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ // For the second pass we need to widen to 32-bit elements, so we're
+ // processing 4 columns at a time.
+ // Skip the second transpose because it is not required.
+
+ b0 = vget_low_s16(s0);
+ b1 = vget_low_s16(s1);
+ b2 = vget_low_s16(s2);
+ b3 = vget_low_s16(s3);
+ b4 = vget_low_s16(s4);
+ b5 = vget_low_s16(s5);
+ b6 = vget_low_s16(s6);
+ b7 = vget_low_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff);
+
+ b0 = vget_high_s16(s0);
+ b1 = vget_high_s16(s1);
+ b2 = vget_high_s16(s2);
+ b3 = vget_high_s16(s3);
+ b4 = vget_high_s16(s4);
+ b5 = vget_high_s16(s5);
+ b6 = vget_high_s16(s6);
+ b7 = vget_high_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32);
+}
+
+void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int i = 0;
+
+ // Rearrange 16x16 to 8x32 and remove stride.
+ // Top left first.
+ vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+ // Top right.
+ vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+ // Bottom left.
+ vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+ coeff + 128);
+ // Bottom right.
+ vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+ coeff + 192);
+
+ do {
+ int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+ int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64);
+ int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128);
+ int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192);
+
+ int32x4_t b0 = vhaddq_s32(a0, a1);
+ int32x4_t b1 = vhsubq_s32(a0, a1);
+ int32x4_t b2 = vhaddq_s32(a2, a3);
+ int32x4_t b3 = vhsubq_s32(a2, a3);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ store_s32q_to_tran_low(coeff + 4 * i, c0);
+ store_s32q_to_tran_low(coeff + 4 * i + 64, c1);
+ store_s32q_to_tran_low(coeff + 4 * i + 128, c2);
+ store_s32q_to_tran_low(coeff + 4 * i + 192, c3);
+ } while (++i < 16);
+}
+
+void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int i = 0;
+
+ // Rearrange 32x32 to 16x64 and remove stride.
+ // Top left first.
+ vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+ // Top right.
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+ // Bottom left.
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+ coeff + 512);
+ // Bottom right.
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+ coeff + 768);
+
+ do {
+ int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+ int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256);
+ int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512);
+ int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768);
+
+ int32x4_t b0 = vhaddq_s32(a0, a1);
+ int32x4_t b1 = vhsubq_s32(a0, a1);
+ int32x4_t b2 = vhaddq_s32(a2, a3);
+ int32x4_t b3 = vhsubq_s32(a2, a3);
+
+ int32x4_t c0 = vhaddq_s32(b0, b2);
+ int32x4_t c1 = vhaddq_s32(b1, b3);
+ int32x4_t c2 = vhsubq_s32(b0, b2);
+ int32x4_t c3 = vhsubq_s32(b1, b3);
+
+ store_s32q_to_tran_low(coeff + 4 * i, c0);
+ store_s32q_to_tran_low(coeff + 4 * i + 256, c1);
+ store_s32q_to_tran_low(coeff + 4 * i + 512, c2);
+ store_s32q_to_tran_low(coeff + 4 * i + 768, c3);
+ } while (++i < 64);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 0000000000..654ab42ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,1361 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
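+// In scalar terms, each lane below computes dct_const_round_shift on a 64-bit
+// intermediate, i.e. (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >>
+// DCT_CONST_BITS), with DCT_CONST_BITS == 14 from vpx_dsp/txfm_common.h.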
+static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) {
+ int32x2x2_t t32;
+
+ t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS);
+ t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS);
+ return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+static INLINE void dct_const_round_shift_high_4_dual(
+ const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) {
+ *d0 = dct_const_round_shift_high_4(in[0]);
+ *d1 = dct_const_round_shift_high_4(in[1]);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) {
+ int32x4x2_t out;
+ out.val[0] = dct_const_round_shift_high_4(in[0]);
+ out.val[1] = dct_const_round_shift_high_4(in[1]);
+ return out;
+}
+
+static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0);
+ *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2);
+}
+
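+// The highbd_idct_cospi_* helpers below evaluate idct butterflies of the form
+//   d0 = dct_const_round_shift(s0 * c_a - s1 * c_b)
+//   d1 = dct_const_round_shift(s1 * c_a + s0 * c_b)
+// in 64-bit arithmetic (the 16_16 variants instead compute
+// (s1 -/+ s0) * cospi_16), with c_a/c_b read from lanes of the packed
+// constant vectors; an N suffix in a constant's name marks a negated lane.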
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26N_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26N_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_q_kernel(
+ const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d_kernel(
+ const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {
+ t[0].val[0] =
+ vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] =
+ vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);
+ t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
+ t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
+ t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);
+ t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[6];
+
+ t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[3];
+
+ t[2].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
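+// Stage 7 is the final butterfly of the idct16 flow graph:
+//   out[i]      = step2[i] + step2[15 - i]
+//   out[15 - i] = step2[i] - step2[15 - i],  for i = 0..7.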
+static INLINE void highbd_idct16x16_add_stage7_dual(
+ const int32x4x2_t *const step2, int32x4x2_t *const out) {
+ out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
+ out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
+ out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
+ out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
+ out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
+ out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
+ out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
+ out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
+ out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
+ out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
+ out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
+ out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
+ out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
+ out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
+ out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
+ out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
+ out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
+ out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
+ out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
+ out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
+ out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
+ out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
+ out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
+ out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
+ out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
+ out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
+ out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
+ out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
+ out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
+ out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
+ out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
+ out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
+ int32x4_t *const out) {
+ out[0] = vaddq_s32(step2[0], step2[15]);
+ out[1] = vaddq_s32(step2[1], step2[14]);
+ out[2] = vaddq_s32(step2[2], step2[13]);
+ out[3] = vaddq_s32(step2[3], step2[12]);
+ out[4] = vaddq_s32(step2[4], step2[11]);
+ out[5] = vaddq_s32(step2[5], step2[10]);
+ out[6] = vaddq_s32(step2[6], step2[9]);
+ out[7] = vaddq_s32(step2[7], step2[8]);
+ out[8] = vsubq_s32(step2[7], step2[8]);
+ out[9] = vsubq_s32(step2[6], step2[9]);
+ out[10] = vsubq_s32(step2[5], step2[10]);
+ out[11] = vsubq_s32(step2[4], step2[11]);
+ out[12] = vsubq_s32(step2[3], step2[12]);
+ out[13] = vsubq_s32(step2[2], step2[13]);
+ out[14] = vsubq_s32(step2[1], step2[14]);
+ out[15] = vsubq_s32(step2[0], step2[15]);
+}
+
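+// One pass of the 16x16 idct over half of the block (8 rows or 8 columns of
+// 32-bit coefficients). When output is non-NULL the results are saved for the
+// second pass; otherwise they are added to dest with bit-depth clamping.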
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[16], step1[16], step2[16], out[16];
+
+ // Load input (16x8)
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[8].val[0] = vld1q_s32(input);
+ in[8].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[9].val[0] = vld1q_s32(input);
+ in[9].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[10].val[0] = vld1q_s32(input);
+ in[10].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[11].val[0] = vld1q_s32(input);
+ in[11].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[12].val[0] = vld1q_s32(input);
+ in[12].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[13].val[0] = vld1q_s32(input);
+ in[13].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[14].val[0] = vld1q_s32(input);
+ in[14].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[15].val[0] = vld1q_s32(input);
+ in[15].val[1] = vld1q_s32(input + 4);
+
+ // Transpose
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+ &in[15]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[1] = in[16 / 2];
+ step1[2] = in[8 / 2];
+ step1[3] = in[24 / 2];
+ step1[4] = in[4 / 2];
+ step1[5] = in[20 / 2];
+ step1[6] = in[12 / 2];
+ step1[7] = in[28 / 2];
+ step1[8] = in[2 / 2];
+ step1[9] = in[18 / 2];
+ step1[10] = in[10 / 2];
+ step1[11] = in[26 / 2];
+ step1[12] = in[6 / 2];
+ step1[13] = in[22 / 2];
+ step1[14] = in[14 / 2];
+ step1[15] = in[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+ highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
+ &step2[15]);
+ highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
+ &step1[7]);
+ highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
+ &step1[6]);
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]);
+
+ // stage 4
+ highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
+ &step2[0]);
+ highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
+ &step2[3]);
+ step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]);
+
+ // stage 6
+ step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]);
+ step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) {
+ highbd_idct16x16_store_pass1(out, output);
+ } else {
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
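+// Single-constant helpers for the reduced _38/_10 paths below, where one
+// input of each early-stage butterfly is known to be zero: the result is
+// just s scaled by one cospi lane and round-shifted by DCT_CONST_BITS.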
+static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
+ const int32x2_t coef) {
+ int64x2x2_t t[2];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0);
+ return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
+ const int32x2_t coef) {
+ int64x2x2_t t;
+
+ t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0);
+ t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0);
+ return dct_const_round_shift_high_4(t);
+}
+
+static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
+ const int32x2_t coef) {
+ int64x2x2_t t[2];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1);
+ return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
+ const int32x2_t coef) {
+ int64x2x2_t t;
+
+ t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1);
+ t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1);
+ return dct_const_round_shift_high_4(t);
+}
+
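+// Half-1d pass for the _38 variant: only the top-left 8x8 of the 16x16 input
+// is assumed to be non-zero, so just eight rows are loaded and the zero-input
+// halves of the stage 2/3 butterflies collapse into the single-constant
+// helpers above.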
+static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input,
+ int32_t *output, uint16_t *dest,
+ const int stride, const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[8], step1[16], step2[16], out[16];
+
+ // Load input (8x8)
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+
+ // Transpose
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[2] = in[8 / 2];
+ step1[4] = in[4 / 2];
+ step1[6] = in[12 / 2];
+ step1[8] = in[2 / 2];
+ step1[10] = in[10 / 2];
+ step1[12] = in[6 / 2];
+ step1[14] = in[14 / 2]; // 0 in pass 1
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+ step2[8] =
+ highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[9] = highbd_idct_cospi_lane1_dual(step1[14],
+ vget_high_s32(cospi_6_26N_14_18N));
+ step2[10] =
+ highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[13] =
+ highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+ step2[14] = highbd_idct_cospi_lane0_dual(step1[14],
+ vget_high_s32(cospi_6_26N_14_18N));
+ step2[15] =
+ highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] =
+ highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[5] =
+ highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28));
+ step1[6] =
+ highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28));
+ step1[7] =
+ highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
+ step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
+ step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
+ step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
+ step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
+ step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
+ step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[14]);
+
+ // stage 4
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[2] =
+ highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24));
+ step2[3] =
+ highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24));
+ step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
+ step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
+ step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
+ step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
+ step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
+ step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+ step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+ step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+ step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+ step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+ step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+ step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+ step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+ step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+ step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+ step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+ step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+ step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) {
+ highbd_idct16x16_store_pass1(out, output);
+ } else {
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
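+// First pass for the _10 variant: only the top-left 4x4 of the input is
+// assumed to be non-zero, so pass 1 works on a single 4x4 tile and stores
+// the sixteen 4-wide results for pass 2 to consume.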
+static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int32_t *output) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x4)
+ in[0] = vld1q_s32(input);
+ input += 16;
+ in[1] = vld1q_s32(input);
+ input += 16;
+ in[2] = vld1q_s32(input);
+ input += 16;
+ in[3] = vld1q_s32(input);
+
+ // Transpose
+ transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] =
+ highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s32(step2[8], step2[11]);
+ step1[9] = vaddq_s32(step2[9], step2[10]);
+ step1[10] = vsubq_s32(step2[9], step2[10]);
+ step1[11] = vsubq_s32(step2[8], step2[11]);
+ step1[12] = vsubq_s32(step2[15], step2[12]);
+ step1[13] = vsubq_s32(step2[14], step2[13]);
+ step1[14] = vaddq_s32(step2[14], step2[13]);
+ step1[15] = vaddq_s32(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vaddq_s32(step1[0], step1[7]);
+ step2[1] = vaddq_s32(step1[1], step1[6]);
+ step2[2] = vaddq_s32(step1[2], step1[5]);
+ step2[3] = vaddq_s32(step1[3], step1[4]);
+ step2[4] = vsubq_s32(step1[3], step1[4]);
+ step2[5] = vsubq_s32(step1[2], step1[5]);
+ step2[6] = vsubq_s32(step1[1], step1[6]);
+ step2[7] = vsubq_s32(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ highbd_idct16x16_add_stage7(step2, out);
+
+ // pass 1: save the result into output
+ vst1q_s32(output, out[0]);
+ output += 4;
+ vst1q_s32(output, out[1]);
+ output += 4;
+ vst1q_s32(output, out[2]);
+ output += 4;
+ vst1q_s32(output, out[3]);
+ output += 4;
+ vst1q_s32(output, out[4]);
+ output += 4;
+ vst1q_s32(output, out[5]);
+ output += 4;
+ vst1q_s32(output, out[6]);
+ output += 4;
+ vst1q_s32(output, out[7]);
+ output += 4;
+ vst1q_s32(output, out[8]);
+ output += 4;
+ vst1q_s32(output, out[9]);
+ output += 4;
+ vst1q_s32(output, out[10]);
+ output += 4;
+ vst1q_s32(output, out[11]);
+ output += 4;
+ vst1q_s32(output, out[12]);
+ output += 4;
+ vst1q_s32(output, out[13]);
+ output += 4;
+ vst1q_s32(output, out[14]);
+ output += 4;
+ vst1q_s32(output, out[15]);
+}
+
+static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
+ int32_t *const output,
+ uint16_t *const dest,
+ const int stride,
+ const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x8)
+ in[0].val[0] = vld1q_s32(input);
+ input += 4;
+ in[0].val[1] = vld1q_s32(input);
+ input += 4;
+ in[1].val[0] = vld1q_s32(input);
+ input += 4;
+ in[1].val[1] = vld1q_s32(input);
+ input += 4;
+ in[2].val[0] = vld1q_s32(input);
+ input += 4;
+ in[2].val[1] = vld1q_s32(input);
+ input += 4;
+ in[3].val[0] = vld1q_s32(input);
+ input += 4;
+ in[3].val[1] = vld1q_s32(input);
+
+ // Transpose
+ transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
+ &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] =
+ highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+ step2[15] =
+ highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] =
+ highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[7] =
+ highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+ step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+ step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+ step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+ step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+ step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+ step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+ step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+ step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+ step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+ step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+ step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+ step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) {
+ highbd_idct16x16_store_pass1(out, output);
+ } else {
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ if (bd == 8) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1);
+
+ // Parallel idct on the lower 8 rows
+ vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+ stride, 1);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8,
+ stride, 1);
+ } else {
+ int32_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride,
+ bd);
+
+ // Parallel idct on the lower 8 rows
+ vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8,
+ dest, stride, bd);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride,
+ bd);
+
+ // Parallel idct to get the right 8 columns
+ vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL,
+ dest + 8, stride, bd);
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ if (bd == 8) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+ stride, 1);
+ } else {
+ int32_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride,
+ bd);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd);
+
+ // Parallel idct to get the right 8 columns
+ vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+ stride, bd);
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ if (bd == 8) {
+ int16_t row_idct_output[4 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+ stride, 1);
+ } else {
+ int32_t row_idct_output[4 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride,
+ bd);
+
+ // Parallel idct to get the right 8 columns
+ highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL,
+ dest + 8, stride, bd);
+ }
+}
+
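+// DC-only idct: the single DC coefficient is scaled by cospi_16_64 twice
+// (once per pass) and rounded by 6 bits, and the resulting constant is added
+// to every pixel. The _pos kernel clamps to the bit-depth maximum; the _neg
+// kernel uses the saturating vqshluq_n_s16 to clamp negative results to zero.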
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest + 0);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest + 0);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ vst1q_u16(*dest + 0, c0);
+ vst1q_u16(*dest + 8, c1);
+ *dest += stride;
+}
+
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
new file mode 100644
index 0000000000..5b36f73367
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_from_transformed(const int32_t *const trans_buf,
+ const int first, const int second,
+ int32x4x2_t *const q0,
+ int32x4x2_t *const q1) {
+ q0->val[0] = vld1q_s32(trans_buf + first * 8);
+ q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4);
+ q1->val[0] = vld1q_s32(trans_buf + second * 8);
+ q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4);
+}
+
+static INLINE void load_from_output(const int32_t *const out, const int first,
+ const int second, int32x4x2_t *const q0,
+ int32x4x2_t *const q1) {
+ q0->val[0] = vld1q_s32(out + first * 32);
+ q0->val[1] = vld1q_s32(out + first * 32 + 4);
+ q1->val[0] = vld1q_s32(out + second * 32);
+ q1->val[1] = vld1q_s32(out + second * 32 + 4);
+}
+
+static INLINE void store_in_output(int32_t *const out, const int first,
+ const int second, const int32x4x2_t q0,
+ const int32x4x2_t q1) {
+ vst1q_s32(out + first * 32, q0.val[0]);
+ vst1q_s32(out + first * 32 + 4, q0.val[1]);
+ vst1q_s32(out + second * 32, q1.val[0]);
+ vst1q_s32(out + second * 32 + 4, q1.val[1]);
+}
+
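+// Rounds four rows of 32-bit results down by 6 bits, adds them to the
+// destination pixels (two rows forward from p1, two rows backward from p2)
+// and clamps each pixel to [0, max].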
+static INLINE void highbd_store_combine_results(
+ uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0,
+ const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3,
+ const int16x8_t max) {
+ int16x8_t o[4];
+ uint16x8_t d[4];
+
+ d[0] = vld1q_u16(p1);
+ p1 += stride;
+ d[1] = vld1q_u16(p1);
+ d[3] = vld1q_u16(p2);
+ p2 -= stride;
+ d[2] = vld1q_u16(p2);
+
+ o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6));
+ o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6));
+ o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6));
+ o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6));
+
+ o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0]));
+ o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1]));
+ o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2]));
+ o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3]));
+ o[0] = vminq_s16(o[0], max);
+ o[1] = vminq_s16(o[1], max);
+ o[2] = vminq_s16(o[2], max);
+ o[3] = vminq_s16(o[3], max);
+ d[0] = vqshluq_n_s16(o[0], 0);
+ d[1] = vqshluq_n_s16(o[1], 0);
+ d[2] = vqshluq_n_s16(o[2], 0);
+ d[3] = vqshluq_n_s16(o[3], 0);
+
+ vst1q_u16(p1, d[1]);
+ p1 -= stride;
+ vst1q_u16(p1, d[0]);
+ vst1q_u16(p2, d[2]);
+ p2 += stride;
+ vst1q_u16(p2, d[3]);
+}
+
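+// Standard idct rotation, computed in 64-bit precision and rounded back to
+// 32 bits with DCT_CONST_BITS:
+//   qOut0 = qIn0 * first_const - qIn1 * second_const
+//   qOut1 = qIn0 * second_const + qIn1 * first_const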
+static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1,
+ const int32_t first_const,
+ const int32_t second_const,
+ int32x4x2_t *const qOut0,
+ int32x4x2_t *const qOut1) {
+ int64x2x2_t q[4];
+ int32x2_t d[6];
+
+ // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
+ d[4] = vdup_n_s32(first_const);
+ d[5] = vdup_n_s32(second_const);
+
+ q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]);
+ q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]);
+ q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]);
+ q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]);
+ q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]);
+ q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]);
+ q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]);
+ q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]);
+
+ q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]);
+ q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]);
+ q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]);
+ q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]);
+ q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]);
+ q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]);
+ q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]);
+ q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]);
+
+ qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS));
+ qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS));
+ qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS));
+ qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS));
+}
+
+static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) {
+ s[0].val[0] = vld1q_s32(in);
+ s[0].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[1].val[0] = vld1q_s32(in);
+ s[1].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[2].val[0] = vld1q_s32(in);
+ s[2].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[3].val[0] = vld1q_s32(in);
+ s[3].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[4].val[0] = vld1q_s32(in);
+ s[4].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[5].val[0] = vld1q_s32(in);
+ s[5].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[6].val[0] = vld1q_s32(in);
+ s[6].val[1] = vld1q_s32(in + 4);
+ in += 32;
+ s[7].val[0] = vld1q_s32(in);
+ s[7].val[1] = vld1q_s32(in + 4);
+}
+
+static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a,
+ int32_t **out) {
+ transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+
+ vst1q_s32(*out, a[0].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[0].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[1].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[1].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[2].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[2].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[3].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[3].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[4].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[4].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[5].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[5].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[6].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[6].val[1]);
+ *out += 4;
+ vst1q_s32(*out, a[7].val[0]);
+ *out += 4;
+ vst1q_s32(*out, a[7].val[1]);
+ *out += 4;
+}
+
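+// Transposes one 8-row by 32-column strip of coefficients into the 32x8
+// working buffer, one 8x8 tile at a time.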
+static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) {
+ int i;
+ int32x4x2_t s[8];
+
+ for (i = 0; i < 4; i++, input += 8) {
+ load_s32x4q_dual(input, s);
+ transpose_and_store_s32_8x8(s, &t_buf);
+ }
+}
+
+static INLINE void idct32_bands_end_1st_pass(int32_t *const out,
+ int32x4x2_t *const q) {
+ store_in_output(out, 16, 17, q[6], q[7]);
+ store_in_output(out, 14, 15, q[8], q[9]);
+
+ load_from_output(out, 30, 31, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ store_in_output(out, 30, 31, q[6], q[7]);
+ store_in_output(out, 0, 1, q[4], q[5]);
+
+ load_from_output(out, 12, 13, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[10], q[1]);
+ q[3] = highbd_idct_add_dual(q[11], q[0]);
+ q[4] = highbd_idct_sub_dual(q[11], q[0]);
+ q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+ load_from_output(out, 18, 19, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ store_in_output(out, 18, 19, q[6], q[7]);
+ store_in_output(out, 12, 13, q[8], q[9]);
+
+ load_from_output(out, 28, 29, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ store_in_output(out, 28, 29, q[6], q[7]);
+ store_in_output(out, 2, 3, q[4], q[5]);
+
+ load_from_output(out, 10, 11, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[12], q[1]);
+ q[3] = highbd_idct_add_dual(q[13], q[0]);
+ q[4] = highbd_idct_sub_dual(q[13], q[0]);
+ q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+ load_from_output(out, 20, 21, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ store_in_output(out, 20, 21, q[6], q[7]);
+ store_in_output(out, 10, 11, q[8], q[9]);
+
+ load_from_output(out, 26, 27, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ store_in_output(out, 26, 27, q[6], q[7]);
+ store_in_output(out, 4, 5, q[4], q[5]);
+
+ load_from_output(out, 8, 9, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[14], q[1]);
+ q[3] = highbd_idct_add_dual(q[15], q[0]);
+ q[4] = highbd_idct_sub_dual(q[15], q[0]);
+ q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+ load_from_output(out, 22, 23, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ store_in_output(out, 22, 23, q[6], q[7]);
+ store_in_output(out, 8, 9, q[8], q[9]);
+
+ load_from_output(out, 24, 25, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ store_in_output(out, 24, 25, q[6], q[7]);
+ store_in_output(out, 6, 7, q[4], q[5]);
+}
+
+static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
+ uint16_t *const dest,
+ const int stride,
+ const int16x8_t max,
+ int32x4x2_t *const q) {
+ uint16_t *dest0 = dest + 0 * stride;
+ uint16_t *dest1 = dest + 31 * stride;
+ uint16_t *dest2 = dest + 16 * stride;
+ uint16_t *dest3 = dest + 15 * stride;
+ const int str2 = stride << 1;
+
+ highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+ max);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 30, 31, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+ max);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 12, 13, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[10], q[1]);
+ q[3] = highbd_idct_add_dual(q[11], q[0]);
+ q[4] = highbd_idct_sub_dual(q[11], q[0]);
+ q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+ load_from_output(out, 18, 19, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+ max);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 28, 29, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+ max);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 10, 11, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[12], q[1]);
+ q[3] = highbd_idct_add_dual(q[13], q[0]);
+ q[4] = highbd_idct_sub_dual(q[13], q[0]);
+ q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+ load_from_output(out, 20, 21, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+ max);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 26, 27, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+ max);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 8, 9, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[14], q[1]);
+ q[3] = highbd_idct_add_dual(q[15], q[0]);
+ q[4] = highbd_idct_sub_dual(q[15], q[0]);
+ q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+ load_from_output(out, 22, 23, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+ highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+ max);
+
+ load_from_output(out, 24, 25, &q[0], &q[1]);
+ q[4] = highbd_idct_add_dual(q[2], q[1]);
+ q[5] = highbd_idct_add_dual(q[3], q[0]);
+ q[6] = highbd_idct_sub_dual(q[3], q[0]);
+ q[7] = highbd_idct_sub_dual(q[2], q[1]);
+ highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+ max);
+}
+
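+// Full 32x32 idct in two passes: pass 1 runs the 1-D idct over the input and
+// stores the intermediate in pass1[]; pass 2 runs it again over pass1[] and
+// adds the clamped result to dst. Each pass works in four 8-wide bands.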
+static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
+ uint16_t *dst, const int stride,
+ const int bd) {
+ int i, idct32_pass_loop;
+ int32_t trans_buf[32 * 8];
+ int32_t pass1[32 * 32];
+ int32_t pass2[32 * 32];
+ int32_t *out;
+ int32x4x2_t q[16];
+
+ for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+ idct32_pass_loop++, input = pass1, out = pass2) {
+ for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+ input += 32 * 8;
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
+ // part of stage 2
+ q[4] = highbd_idct_add_dual(q[0], q[1]);
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[6] = highbd_idct_add_dual(q[2], q[3]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
+
+ // generate 18,19,28,29
+ // part of stage 1
+ load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = highbd_idct_sub_dual(q[3], q[2]);
+ q[3] = highbd_idct_add_dual(q[3], q[2]);
+ q[14] = highbd_idct_sub_dual(q[1], q[0]);
+ q[2] = highbd_idct_add_dual(q[1], q[0]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
+ // part of stage 4
+ q[8] = highbd_idct_add_dual(q[4], q[2]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[10] = highbd_idct_add_dual(q[7], q[1]);
+ q[15] = highbd_idct_add_dual(q[6], q[3]);
+ q[13] = highbd_idct_sub_dual(q[5], q[0]);
+ q[14] = highbd_idct_sub_dual(q[7], q[1]);
+ store_in_output(out, 16, 31, q[8], q[15]);
+ store_in_output(out, 17, 30, q[9], q[10]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+ store_in_output(out, 29, 18, q[1], q[0]);
+ // part of stage 4
+ q[13] = highbd_idct_sub_dual(q[4], q[2]);
+ q[14] = highbd_idct_sub_dual(q[6], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+ store_in_output(out, 19, 28, q[4], q[6]);
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+
+ // generate 22,23,24,25
+ // part of stage 1
+ load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
+ // part of stage 2
+ q[14] = highbd_idct_sub_dual(q[4], q[5]);
+ q[5] = highbd_idct_add_dual(q[4], q[5]);
+ q[13] = highbd_idct_sub_dual(q[6], q[7]);
+ q[6] = highbd_idct_add_dual(q[6], q[7]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
+ // part of stage 4
+ q[10] = highbd_idct_add_dual(q[7], q[1]);
+ q[11] = highbd_idct_add_dual(q[5], q[0]);
+ q[12] = highbd_idct_add_dual(q[6], q[2]);
+ q[15] = highbd_idct_add_dual(q[4], q[3]);
+ // part of stage 6
+ load_from_output(out, 16, 17, &q[14], &q[13]);
+ q[8] = highbd_idct_add_dual(q[14], q[11]);
+ q[9] = highbd_idct_add_dual(q[13], q[10]);
+ q[13] = highbd_idct_sub_dual(q[13], q[10]);
+ q[11] = highbd_idct_sub_dual(q[14], q[11]);
+ store_in_output(out, 17, 16, q[9], q[8]);
+ load_from_output(out, 30, 31, &q[14], &q[9]);
+ q[8] = highbd_idct_sub_dual(q[9], q[12]);
+ q[10] = highbd_idct_add_dual(q[14], q[15]);
+ q[14] = highbd_idct_sub_dual(q[14], q[15]);
+ q[12] = highbd_idct_add_dual(q[9], q[12]);
+ store_in_output(out, 30, 31, q[10], q[12]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 25, 22, q[14], q[13]);
+ do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 24, 23, q[14], q[13]);
+ // part of stage 4
+ q[14] = highbd_idct_sub_dual(q[5], q[0]);
+ q[13] = highbd_idct_sub_dual(q[6], q[2]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+ q[14] = highbd_idct_sub_dual(q[7], q[1]);
+ q[13] = highbd_idct_sub_dual(q[4], q[3]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
+ // part of stage 6
+ load_from_output(out, 18, 19, &q[14], &q[13]);
+ q[8] = highbd_idct_add_dual(q[14], q[1]);
+ q[9] = highbd_idct_add_dual(q[13], q[6]);
+ q[13] = highbd_idct_sub_dual(q[13], q[6]);
+ q[1] = highbd_idct_sub_dual(q[14], q[1]);
+ store_in_output(out, 18, 19, q[8], q[9]);
+ load_from_output(out, 28, 29, &q[8], &q[9]);
+ q[14] = highbd_idct_sub_dual(q[8], q[5]);
+ q[10] = highbd_idct_add_dual(q[8], q[5]);
+ q[11] = highbd_idct_add_dual(q[9], q[0]);
+ q[0] = highbd_idct_sub_dual(q[9], q[0]);
+ store_in_output(out, 28, 29, q[10], q[11]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 20, 27, q[13], q[14]);
+ do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+ store_in_output(out, 21, 26, q[1], q[0]);
+
+ // -----------------------------------------
+      // BLOCK C: 8-11,12-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
+ // part of stage 3
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
+
+ // generate 10,11,12,13
+ // part of stage 2
+ load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
+ // part of stage 3
+ q[14] = highbd_idct_sub_dual(q[4], q[5]);
+ q[5] = highbd_idct_add_dual(q[4], q[5]);
+ q[13] = highbd_idct_sub_dual(q[6], q[7]);
+ q[6] = highbd_idct_add_dual(q[6], q[7]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
+ // part of stage 5
+ q[8] = highbd_idct_add_dual(q[0], q[5]);
+ q[9] = highbd_idct_add_dual(q[1], q[7]);
+ q[13] = highbd_idct_sub_dual(q[1], q[7]);
+ q[14] = highbd_idct_sub_dual(q[3], q[4]);
+ q[10] = highbd_idct_add_dual(q[3], q[4]);
+ q[15] = highbd_idct_add_dual(q[2], q[6]);
+ store_in_output(out, 8, 15, q[8], q[15]);
+ store_in_output(out, 9, 14, q[9], q[10]);
+ // part of stage 6
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 13, 10, q[3], q[1]);
+ q[13] = highbd_idct_sub_dual(q[0], q[5]);
+ q[14] = highbd_idct_sub_dual(q[2], q[6]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 11, 12, q[1], q[3]);
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+ // part of stage 4
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+
+ // generate 0,1,2,3
+ // part of stage 4
+ load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
+ // part of stage 5
+ q[4] = highbd_idct_add_dual(q[7], q[6]);
+ q[7] = highbd_idct_sub_dual(q[7], q[6]);
+ q[6] = highbd_idct_sub_dual(q[5], q[14]);
+ q[5] = highbd_idct_add_dual(q[5], q[14]);
+ // part of stage 6
+ q[8] = highbd_idct_add_dual(q[4], q[2]);
+ q[9] = highbd_idct_add_dual(q[5], q[3]);
+ q[10] = highbd_idct_add_dual(q[6], q[1]);
+ q[11] = highbd_idct_add_dual(q[7], q[0]);
+ q[12] = highbd_idct_sub_dual(q[7], q[0]);
+ q[13] = highbd_idct_sub_dual(q[6], q[1]);
+ q[14] = highbd_idct_sub_dual(q[5], q[3]);
+ q[15] = highbd_idct_sub_dual(q[4], q[2]);
+ // part of stage 7
+ load_from_output(out, 14, 15, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[8], q[1]);
+ q[3] = highbd_idct_add_dual(q[9], q[0]);
+ q[4] = highbd_idct_sub_dual(q[9], q[0]);
+ q[5] = highbd_idct_sub_dual(q[8], q[1]);
+ load_from_output(out, 16, 17, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out, q);
+ } else {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ idct32_bands_end_2nd_pass(out, dst, stride, max, q);
+ dst += 8;
+ }
+ }
+ }
+}
+
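+// For bd == 8 the 16-bit precision of the low-bitdepth transform suffices,
+// so it is reused: CAST_TO_BYTEPTR reinterprets the uint16_t destination and
+// the trailing flag tells the low-bitdepth code the pixels are 16 bits wide.
+// Larger bit depths require the 32-bit intermediates of the highbd path.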
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ if (bd == 8) {
+ vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
+ } else {
+ vpx_highbd_idct32_32_neon(input, dest, stride, bd);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
new file mode 100644
index 0000000000..6750c1a426
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
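+// Load the left 8x8 block of 32-bit coefficients: eight rows of eight
+// values from a 32-wide buffer, stepping one row (32 elements) per load.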
+static INLINE void load_8x8_s32_dual(
+ const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
+ int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
+ int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
+ in0->val[0] = vld1q_s32(input);
+ in0->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in1->val[0] = vld1q_s32(input);
+ in1->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in2->val[0] = vld1q_s32(input);
+ in2->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in3->val[0] = vld1q_s32(input);
+ in3->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in4->val[0] = vld1q_s32(input);
+ in4->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in5->val[0] = vld1q_s32(input);
+ in5->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in6->val[0] = vld1q_s32(input);
+ in6->val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in7->val[0] = vld1q_s32(input);
+ in7->val[1] = vld1q_s32(input + 4);
+}
+
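+// Load a 4-wide slice of eight rows from a 32-wide buffer; used below for
+// columns 8-11, since columns 12-15 are never referenced.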
+static INLINE void load_4x8_s32_dual(const tran_low_t *input,
+ int32x4_t *const in0, int32x4_t *const in1,
+ int32x4_t *const in2, int32x4_t *const in3,
+ int32x4_t *const in4, int32x4_t *const in5,
+ int32x4_t *const in6,
+ int32x4_t *const in7) {
+ *in0 = vld1q_s32(input);
+ input += 32;
+ *in1 = vld1q_s32(input);
+ input += 32;
+ *in2 = vld1q_s32(input);
+ input += 32;
+ *in3 = vld1q_s32(input);
+ input += 32;
+ *in4 = vld1q_s32(input);
+ input += 32;
+ *in5 = vld1q_s32(input);
+ input += 32;
+ *in6 = vld1q_s32(input);
+ input += 32;
+ *in7 = vld1q_s32(input);
+}
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
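+//
+// Conceptually the first pass is the scalar loop sketched below (idct32_row
+// is an illustrative name, not a function in this file); the result is
+// stored transposed, 16 values per output row, so the second pass can load
+// its columns contiguously:
+//
+//   for (i = 0; i < 16; ++i)             // rows 16..31 are all zero
+//     idct32_row(input + i * 32, temp);  // element j goes to temp[j*16 + i]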
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// 0 0 2 5 10 17 25 38 47 62 83 101 121
+// 1 1 4 8 15 22 30 45 58 74 92 112 133
+// 2 3 7 12 18 28 36 52 64 82 102 118
+// 3 6 11 16 23 31 43 60 73 90 109 126
+// 4 9 14 19 29 37 50 65 78 98 116 134
+// 5 13 20 26 35 44 54 72 85 105 123
+// 6 21 27 33 42 53 63 80 94 113 132
+// 7 24 32 39 48 57 71 88 104 120
+// 8 34 40 46 56 68 81 96 111 130
+// 9 41 49 55 67 77 91 107 124
+// 10 51 59 66 76 89 99 119 131
+// 11 61 69 75 87 100 114 129
+// 12 70 79 86 97 108 122
+// 13 84 93 103 110 125
+// 14 95 106 115 127
+// 15 117 128
+static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
+ int32_t *output) {
+ int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ s8[32];
+
+ load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
+ &in[6], &in[7]);
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
+ &in[9].val[1], &in[10].val[0], &in[10].val[1],
+ &in[11].val[0], &in[11].val[1]);
+ transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
+ &in[10].val[0], &in[10].val[1], &in[11].val[0],
+ &in[11].val[1]);
+
+ // stage 1
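+ // multiply_shift_and_narrow_s32_dual(a, c) returns
+ // dct_const_round_shift(a * c) on all eight 32-bit lanes; with one
+ // butterfly input known to be zero, each rotation collapses to a single
+ // multiply.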
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+ s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+ s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+ s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+ s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+ s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+ s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+ s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+ s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+ s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+ s2[29], cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+ s2[26], cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+ s3[13], cospi_24_64);
+
+ s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
+ s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+ s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+ s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
+ s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
+ s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+ s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+ s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
+ s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
+ s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+ s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+ s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
+ s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
+ s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+ s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+ s4[31] = highbd_idct_add_dual(s2[28], s1[31]);
+
+ // stage 5
+ s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+ s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+ s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+ s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);
+
+ s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
+ s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+ s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+ s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
+ s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
+ s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+ s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+ s5[15] = highbd_idct_add_dual(s2[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+ s4[29], cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+ s4[29], cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+ s4[28], cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+ s4[28], cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+ s4[27], cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+ s4[26], cospi_24_64);
+
+ // stage 6
+ s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
+ s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+ s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+ s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
+ s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
+ s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+ s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+ s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+ s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+ s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+ s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+ s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+ s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+ s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+ s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+
+ s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+ s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+ s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+ s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+ s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+ s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+ s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+ s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+ s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+ s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+ s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+ s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+ s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+ s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+ s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+ s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+ s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+ s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+ s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+ s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+ s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+ s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+ s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
+ s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
+ s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
+ s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
+ s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
+ s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
+ s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
+ s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
+ s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
+ s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
+ s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
+ s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
+ s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
+ s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
+ s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
+ s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
+ s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+ s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+ s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+ s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+ s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+ s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+ s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+ s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+ s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+ s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+ s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+ s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+ s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+ s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+ s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+ s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+ vst1q_s32(output + 0, s8[0].val[0]);
+ vst1q_s32(output + 4, s8[0].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[1].val[0]);
+ vst1q_s32(output + 4, s8[1].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[2].val[0]);
+ vst1q_s32(output + 4, s8[2].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[3].val[0]);
+ vst1q_s32(output + 4, s8[3].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[4].val[0]);
+ vst1q_s32(output + 4, s8[4].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[5].val[0]);
+ vst1q_s32(output + 4, s8[5].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[6].val[0]);
+ vst1q_s32(output + 4, s8[6].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[7].val[0]);
+ vst1q_s32(output + 4, s8[7].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[8].val[0]);
+ vst1q_s32(output + 4, s8[8].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[9].val[0]);
+ vst1q_s32(output + 4, s8[9].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[10].val[0]);
+ vst1q_s32(output + 4, s8[10].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[11].val[0]);
+ vst1q_s32(output + 4, s8[11].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[12].val[0]);
+ vst1q_s32(output + 4, s8[12].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[13].val[0]);
+ vst1q_s32(output + 4, s8[13].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[14].val[0]);
+ vst1q_s32(output + 4, s8[14].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[15].val[0]);
+ vst1q_s32(output + 4, s8[15].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[16].val[0]);
+ vst1q_s32(output + 4, s8[16].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[17].val[0]);
+ vst1q_s32(output + 4, s8[17].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[18].val[0]);
+ vst1q_s32(output + 4, s8[18].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[19].val[0]);
+ vst1q_s32(output + 4, s8[19].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[20].val[0]);
+ vst1q_s32(output + 4, s8[20].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[21].val[0]);
+ vst1q_s32(output + 4, s8[21].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[22].val[0]);
+ vst1q_s32(output + 4, s8[22].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[23].val[0]);
+ vst1q_s32(output + 4, s8[23].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[24].val[0]);
+ vst1q_s32(output + 4, s8[24].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[25].val[0]);
+ vst1q_s32(output + 4, s8[25].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[26].val[0]);
+ vst1q_s32(output + 4, s8[26].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[27].val[0]);
+ vst1q_s32(output + 4, s8[27].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[28].val[0]);
+ vst1q_s32(output + 4, s8[28].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[29].val[0]);
+ vst1q_s32(output + 4, s8[29].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[30].val[0]);
+ vst1q_s32(output + 4, s8[30].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[31].val[0]);
+ vst1q_s32(output + 4, s8[31].val[1]);
+}
+
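+// Second pass of the _135_ variant: each call transforms eight columns of
+// the 16-deep intermediate and adds the clamped result to the destination.
+// All 16 inputs may be non-zero here, so no terms are skipped.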
+static void vpx_highbd_idct32_16_neon(const int32_t *const input,
+ uint16_t *const output, const int stride,
+ const int bd) {
+ int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ out[32];
+
+ load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+ &in[12], &in[13], &in[14], &in[15]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64);
+ s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64);
+
+ s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+ s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64);
+ s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64);
+ s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64);
+
+ s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s1[17]);
+ s2[17] = highbd_idct_sub_dual(s1[16], s1[17]);
+ s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+ s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+ s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+ s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+ s2[22] = highbd_idct_sub_dual(s1[23], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[22], s1[23]);
+ s2[24] = highbd_idct_add_dual(s1[24], s1[25]);
+ s2[25] = highbd_idct_sub_dual(s1[24], s1[25]);
+ s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+ s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+ s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+ s2[30] = highbd_idct_sub_dual(s1[31], s1[30]);
+ s2[31] = highbd_idct_add_dual(s1[30], s1[31]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64);
+ s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64);
+
+ s3[8] = highbd_idct_add_dual(s2[8], s2[9]);
+ s3[9] = highbd_idct_sub_dual(s2[8], s2[9]);
+ s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+ s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+ s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+ s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+ s3[14] = highbd_idct_sub_dual(s2[15], s2[14]);
+ s3[15] = highbd_idct_add_dual(s2[14], s2[15]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64,
+ s2[30], cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64,
+ s2[30], cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+ s2[29], cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+ s2[26], cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64,
+ s2[25], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64,
+ s2[25], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+ s4[4] = highbd_idct_add_dual(s3[4], s3[5]);
+ s4[5] = highbd_idct_sub_dual(s3[4], s3[5]);
+ s4[6] = highbd_idct_sub_dual(s3[7], s3[6]);
+ s4[7] = highbd_idct_add_dual(s3[6], s3[7]);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64,
+ s3[14], cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64,
+ s3[14], cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+ s3[13], cospi_24_64);
+
+ s4[16] = highbd_idct_add_dual(s2[16], s2[19]);
+ s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+ s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+ s4[19] = highbd_idct_sub_dual(s2[16], s2[19]);
+ s4[20] = highbd_idct_sub_dual(s2[23], s2[20]);
+ s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+ s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+ s4[23] = highbd_idct_add_dual(s2[20], s2[23]);
+ s4[24] = highbd_idct_add_dual(s2[24], s2[27]);
+ s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+ s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+ s4[27] = highbd_idct_sub_dual(s2[24], s2[27]);
+ s4[28] = highbd_idct_sub_dual(s2[31], s2[28]);
+ s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+ s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+ s4[31] = highbd_idct_add_dual(s2[28], s2[31]);
+
+ // stage 5
+ s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+ s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+ s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+ s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64);
+
+ s5[8] = highbd_idct_add_dual(s3[8], s3[11]);
+ s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+ s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+ s5[11] = highbd_idct_sub_dual(s3[8], s3[11]);
+ s5[12] = highbd_idct_sub_dual(s3[15], s3[12]);
+ s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+ s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+ s5[15] = highbd_idct_add_dual(s3[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+ s4[29], cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+ s4[29], cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+ s4[28], cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+ s4[28], cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+ s4[27], cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+ s4[26], cospi_24_64);
+
+ // stage 6
+ s6[0] = highbd_idct_add_dual(s5[0], s4[7]);
+ s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+ s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+ s6[3] = highbd_idct_add_dual(s5[3], s4[4]);
+ s6[4] = highbd_idct_sub_dual(s5[3], s4[4]);
+ s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+ s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+ s6[7] = highbd_idct_sub_dual(s5[0], s4[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+ s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+ s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+ s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+ s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+ s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+ s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+ s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+ s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+ s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+ s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+ s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+ s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+ s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+ s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+ s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+ s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+ s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+ s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+ s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+ s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+ s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+ s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+ s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+ s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+ s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+ s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+ s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+ s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+ s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+ s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ out[0] = highbd_idct_add_dual(s7[0], s6[31]);
+ out[1] = highbd_idct_add_dual(s7[1], s6[30]);
+ out[2] = highbd_idct_add_dual(s7[2], s6[29]);
+ out[3] = highbd_idct_add_dual(s7[3], s6[28]);
+ out[4] = highbd_idct_add_dual(s7[4], s7[27]);
+ out[5] = highbd_idct_add_dual(s7[5], s7[26]);
+ out[6] = highbd_idct_add_dual(s7[6], s7[25]);
+ out[7] = highbd_idct_add_dual(s7[7], s7[24]);
+ out[8] = highbd_idct_add_dual(s7[8], s7[23]);
+ out[9] = highbd_idct_add_dual(s7[9], s7[22]);
+ out[10] = highbd_idct_add_dual(s7[10], s7[21]);
+ out[11] = highbd_idct_add_dual(s7[11], s7[20]);
+ out[12] = highbd_idct_add_dual(s7[12], s6[19]);
+ out[13] = highbd_idct_add_dual(s7[13], s6[18]);
+ out[14] = highbd_idct_add_dual(s7[14], s6[17]);
+ out[15] = highbd_idct_add_dual(s7[15], s6[16]);
+ out[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+ out[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+ out[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+ out[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+ out[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+ out[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+ out[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+ out[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+ out[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+ out[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+ out[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+ out[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+ out[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+ out[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+ out[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+ out[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+ highbd_idct16x16_add_store(out, output, stride, bd);
+ highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
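+// Two-pass driver: the first pass transforms rows 0-7 and 8-15 in two
+// calls, writing a transposed 32x16 intermediate; the second pass then
+// transforms eight columns per iteration, adding into the destination.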
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+
+ if (bd == 8) {
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+ vpx_idct32_12_neon(input, temp);
+ vpx_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_16_neon(t, dest, stride, 1);
+ t += (16 * 8);
+ dest += 8;
+ }
+ } else {
+ int32_t temp[32 * 16];
+ int32_t *t = temp;
+ vpx_highbd_idct32_12_neon(input, temp);
+ vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_highbd_idct32_16_neon(t, dest, stride, bd);
+ t += (16 * 8);
+ dest += 8;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
new file mode 100644
index 0000000000..f05932cec3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[6|7] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
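+//
+// As in the _135_ variant, this is conceptually one scalar 32-point row
+// transform per potentially non-zero row (only rows 0-7 here), with the
+// intermediate again stored transposed for the column pass.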
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) {
+ int32x4x2_t in[8], s1[32], s2[32], s3[32];
+
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+ s1[27], cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+ s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+ s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+ s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+ s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+ s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64,
+ s1[30], cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64,
+ s1[30], cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64,
+ s1[31], cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64,
+ s1[31], cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+ s2[27], cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+ s2[26], cospi_24_64);
+
+ // stage 6
+ s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+ s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+ s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+ s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+ s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+ s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+ s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+ s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s2[23]);
+ s2[17] = highbd_idct_add_dual(s1[17], s2[22]);
+ s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+ s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+ s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+ s2[22] = highbd_idct_sub_dual(s1[17], s2[22]);
+ s2[23] = highbd_idct_sub_dual(s1[16], s2[23]);
+
+ s3[24] = highbd_idct_sub_dual(s1[31], s2[24]);
+ s3[25] = highbd_idct_sub_dual(s1[30], s2[25]);
+ s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+ s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+ s2[30] = highbd_idct_add_dual(s2[25], s1[30]);
+ s2[31] = highbd_idct_add_dual(s2[24], s1[31]);
+
+ // stage 7
+ s1[0] = highbd_idct_add_dual(s2[0], s2[15]);
+ s1[1] = highbd_idct_add_dual(s2[1], s2[14]);
+ s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+ s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+ s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+ s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+ s1[6] = highbd_idct_add_dual(s2[6], s2[9]);
+ s1[7] = highbd_idct_add_dual(s2[7], s2[8]);
+ s1[8] = highbd_idct_sub_dual(s2[7], s2[8]);
+ s1[9] = highbd_idct_sub_dual(s2[6], s2[9]);
+ s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+ s1[14] = highbd_idct_sub_dual(s2[1], s2[14]);
+ s1[15] = highbd_idct_sub_dual(s2[0], s2[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+ s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64);
+
+ s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64);
+
+ // final stage
+ s3[0] = highbd_idct_add_dual(s1[0], s2[31]);
+ s3[1] = highbd_idct_add_dual(s1[1], s2[30]);
+ s3[2] = highbd_idct_add_dual(s1[2], s2[29]);
+ s3[3] = highbd_idct_add_dual(s1[3], s2[28]);
+ s3[4] = highbd_idct_add_dual(s1[4], s1[27]);
+ s3[5] = highbd_idct_add_dual(s1[5], s1[26]);
+ s3[6] = highbd_idct_add_dual(s1[6], s1[25]);
+ s3[7] = highbd_idct_add_dual(s1[7], s1[24]);
+ s3[8] = highbd_idct_add_dual(s1[8], s1[23]);
+ s3[9] = highbd_idct_add_dual(s1[9], s1[22]);
+ s3[10] = highbd_idct_add_dual(s1[10], s1[21]);
+ s3[11] = highbd_idct_add_dual(s1[11], s1[20]);
+ s3[12] = highbd_idct_add_dual(s1[12], s2[19]);
+ s3[13] = highbd_idct_add_dual(s1[13], s2[18]);
+ s3[14] = highbd_idct_add_dual(s1[14], s2[17]);
+ s3[15] = highbd_idct_add_dual(s1[15], s2[16]);
+ s3[16] = highbd_idct_sub_dual(s1[15], s2[16]);
+ s3[17] = highbd_idct_sub_dual(s1[14], s2[17]);
+ s3[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+ s3[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+ s3[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+ s3[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+ s3[22] = highbd_idct_sub_dual(s1[9], s1[22]);
+ s3[23] = highbd_idct_sub_dual(s1[8], s1[23]);
+ s3[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+ s3[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+ s3[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+ s3[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+ s3[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+ s3[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+ s3[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+ vst1q_s32(output, s3[0].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[0].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[1].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[1].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[2].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[2].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[3].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[3].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[4].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[4].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[5].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[5].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[6].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[6].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[7].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[7].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[8].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[8].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[9].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[9].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[10].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[10].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[11].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[11].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[12].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[12].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[13].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[13].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[14].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[14].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[15].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[15].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[16].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[16].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[17].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[17].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[18].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[18].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[19].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[19].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[20].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[20].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[21].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[21].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[22].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[22].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[23].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[23].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[24].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[24].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[25].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[25].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[26].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[26].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[27].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[27].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[28].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[28].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[29].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[29].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[30].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[30].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[31].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[31].val[1]);
+}
+
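+// Second pass of the _34_ variant: each call transforms eight columns of
+// the 8-deep intermediate and adds the clamped result to the destination.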
+static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
+ int stride, const int bd) {
+ int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32];
+
+ load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ // Different for _8_
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ // Different for _8_
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64,
+ s1[28], -cospi_4_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64,
+ s1[28], cospi_28_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+ s1[27], cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64,
+ s2[12], -cospi_8_64);
+ s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64,
+ s2[12], cospi_24_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s1[19]);
+
+ s2[17] = highbd_idct_add_dual(s1[17], s1[18]);
+ s2[18] = highbd_idct_sub_dual(s1[17], s1[18]);
+
+ s2[19] = highbd_idct_sub_dual(s1[16], s1[19]);
+
+ s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+
+ s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+
+ s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+ s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+ s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+ s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+ s2[28] = highbd_idct_sub_dual(s1[31], s1[28]);
+ s2[29] = highbd_idct_sub_dual(s1[30], s1[29]);
+ s2[30] = highbd_idct_add_dual(s1[29], s1[30]);
+ s2[31] = highbd_idct_add_dual(s1[28], s1[31]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+ s1[8] = highbd_idct_add_dual(s2[8], s2[11]);
+ s1[9] = highbd_idct_add_dual(s2[9], s2[10]);
+ s1[10] = highbd_idct_sub_dual(s2[9], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[8], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[15], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[14], s2[13]);
+ s1[14] = highbd_idct_add_dual(s2[13], s2[14]);
+ s1[15] = highbd_idct_add_dual(s2[12], s2[15]);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64,
+ s2[29], cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64,
+ s2[29], cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64,
+ s2[28], cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64,
+ s2[28], cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+ s2[27], cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+ s2[26], cospi_24_64);
+
+ // stage 6
+ s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+ s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+ s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+ s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+ s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+ s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+ s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+ s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64);
+
+ s1[16] = highbd_idct_add_dual(s2[16], s2[23]);
+ s1[17] = highbd_idct_add_dual(s2[17], s2[22]);
+ s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+ s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+ s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+ s1[22] = highbd_idct_sub_dual(s2[17], s2[22]);
+ s1[23] = highbd_idct_sub_dual(s2[16], s2[23]);
+
+ s3[24] = highbd_idct_sub_dual(s2[31], s2[24]);
+ s3[25] = highbd_idct_sub_dual(s2[30], s2[25]);
+ s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+ s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+ s2[30] = highbd_idct_add_dual(s2[25], s2[30]);
+ s2[31] = highbd_idct_add_dual(s2[24], s2[31]);
+
+ // stage 7
+ s1[0] = highbd_idct_add_dual(s2[0], s1[15]);
+ s1[1] = highbd_idct_add_dual(s2[1], s1[14]);
+ s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+ s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+ s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+ s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+ s1[6] = highbd_idct_add_dual(s2[6], s1[9]);
+ s1[7] = highbd_idct_add_dual(s2[7], s1[8]);
+ s1[8] = highbd_idct_sub_dual(s2[7], s1[8]);
+ s1[9] = highbd_idct_sub_dual(s2[6], s1[9]);
+ s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+ s1[14] = highbd_idct_sub_dual(s2[1], s1[14]);
+ s1[15] = highbd_idct_sub_dual(s2[0], s1[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+ s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64);
+
+ s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64);
+
+ // final stage
+ out[0] = highbd_idct_add_dual(s1[0], s2[31]);
+ out[1] = highbd_idct_add_dual(s1[1], s2[30]);
+ out[2] = highbd_idct_add_dual(s1[2], s2[29]);
+ out[3] = highbd_idct_add_dual(s1[3], s2[28]);
+ out[4] = highbd_idct_add_dual(s1[4], s1[27]);
+ out[5] = highbd_idct_add_dual(s1[5], s1[26]);
+ out[6] = highbd_idct_add_dual(s1[6], s1[25]);
+ out[7] = highbd_idct_add_dual(s1[7], s1[24]);
+ out[8] = highbd_idct_add_dual(s1[8], s2[23]);
+ out[9] = highbd_idct_add_dual(s1[9], s2[22]);
+ out[10] = highbd_idct_add_dual(s1[10], s1[21]);
+ out[11] = highbd_idct_add_dual(s1[11], s1[20]);
+ out[12] = highbd_idct_add_dual(s1[12], s2[19]);
+ out[13] = highbd_idct_add_dual(s1[13], s2[18]);
+ out[14] = highbd_idct_add_dual(s1[14], s1[17]);
+ out[15] = highbd_idct_add_dual(s1[15], s1[16]);
+ out[16] = highbd_idct_sub_dual(s1[15], s1[16]);
+ out[17] = highbd_idct_sub_dual(s1[14], s1[17]);
+ out[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+ out[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+ out[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+ out[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+ out[22] = highbd_idct_sub_dual(s1[9], s2[22]);
+ out[23] = highbd_idct_sub_dual(s1[8], s2[23]);
+ out[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+ out[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+ out[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+ out[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+ out[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+ out[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+ out[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+ out[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+ highbd_idct16x16_add_store(out, output, stride, bd);
+ highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
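+// A single first-pass call suffices here: only the top-left 8x8 of the
+// input can be non-zero, so one 8-row pass fills the whole intermediate.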
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+
+ if (bd == 8) {
+ int16_t temp[32 * 8];
+ int16_t *t = temp;
+
+ vpx_idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_8_neon(t, dest, stride, 1);
+ t += (8 * 8);
+ dest += 8;
+ }
+ } else {
+ int32_t temp[32 * 8];
+ int32_t *t = temp;
+
+ vpx_highbd_idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_highbd_idct32_8_neon(t, dest, stride, bd);
+ t += (8 * 8);
+ dest += 8;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 0000000000..c1354c0c1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
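+// Add the splatted DC value to one 32-pixel row and clamp to the valid
+// range. With a non-negative DC only the upper bound (1 << bd) - 1 can be
+// exceeded, so a single vminq_s16 per vector suffices.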
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ const int16x8_t c2 = vminq_s16(b2, max);
+ const int16x8_t c3 = vminq_s16(b3, max);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+ vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+ *dest += stride;
+}
+
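+// With a negative DC only the lower bound can be crossed. vqshluq_n_s16
+// with a shift of 0 saturates negative lanes to zero while converting the
+// vector to unsigned, so no upper clamp is needed.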
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
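+  // res is negative here, so the sums cannot exceed (1 << bd) - 1 and only
+  // the lower clamp is needed: vqshluq_n_s16(x, 0) saturates negative values
+  // to zero while converting to unsigned.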
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+ const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+ vst1q_u16(*dest, c0);
+ vst1q_u16(*dest + 8, c1);
+ vst1q_u16(*dest + 16, c2);
+ vst1q_u16(*dest + 24, c3);
+ *dest += stride;
+}
+
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
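
For reference, the DC-only path above reduces to a short scalar routine: two
multiplications by cospi_16_64 (one per transform pass), each followed by the
DCT rounding shift, then the final output rounding by 6 for the 32x32 size. A
sketch with local helpers follows; the constants mirror vpx_dsp/inv_txfm.h,
the sketch_-prefixed names are ours, and the HIGHBD_WRAPLOW wrapping step is
omitted for brevity.

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_COSPI_16_64 11585 /* round(cos(pi/4) * 2^14) */

int64_t sketch_round_shift(int64_t x) {
  return (x + (1 << (SKETCH_DCT_CONST_BITS - 1))) >> SKETCH_DCT_CONST_BITS;
}

uint16_t sketch_clip(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

/* Add the single DC term of a 32x32 inverse transform to dest.
 * input is int32_t here, as tran_low_t is in high-bitdepth builds. */
void sketch_idct32x32_1_add(const int32_t *input, uint16_t *dest, int stride,
                            int bd) {
  const int32_t out0 =
      (int32_t)sketch_round_shift(input[0] * (int64_t)SKETCH_COSPI_16_64);
  const int32_t out1 =
      (int32_t)sketch_round_shift(out0 * (int64_t)SKETCH_COSPI_16_64);
  const int32_t a1 = (out1 + 32) >> 6; /* ROUND_POWER_OF_TWO(out1, 6) */
  int r, c;
  for (r = 0; r < 32; ++r, dest += stride) {
    for (c = 0; c < 32; ++c) dest[c] = sketch_clip(dest[c] + a1, bd);
  }
}

The a1 >= 0 split in the NEON version exists because a non-negative DC only
needs the upper clamp (vminq_s16 against (1 << bd) - 1), while a negative DC
only needs the lower clamp at zero.
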
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
new file mode 100644
index 0000000000..7be1dad1d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// res is in reverse row order
+static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x4_t a0 = vld1_u16(*dest);
+ const uint16x4_t a1 = vld1_u16(*dest + stride);
+ const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
+ // Note: In some profile tests, res is quite close to +/-32767.
+ // We use saturating addition.
+ const int16x8_t b = vqaddq_s16(res, a);
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1_u16(*dest, vget_high_u16(d));
+ *dest += stride;
+ vst1_u16(*dest, vget_low_u16(d));
+ *dest += stride;
+}
+
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+
+ highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+ highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+}
+
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ int16x8_t a[2];
+ int32x4_t c[4];
+
+ c[0] = vld1q_s32(input);
+ c[1] = vld1q_s32(input + 4);
+ c[2] = vld1q_s32(input + 8);
+ c[3] = vld1q_s32(input + 12);
+
+ if (bd == 8) {
+ // Rows
+ a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+ a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+ transpose_idct4x4_16_bd8(a);
+
+ // Columns
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_idct4x4_16_bd8(a);
+ a[0] = vrshrq_n_s16(a[0], 4);
+ a[1] = vrshrq_n_s16(a[1], 4);
+ } else {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, c);
+ idct4x4_16_kernel_bd10(cospis, c);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, c);
+ idct4x4_16_kernel_bd12(cospis, c);
+ }
+ a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+ a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4));
+ }
+
+ highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+ highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
new file mode 100644
index 0000000000..bed3227ca7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a = vld1q_u16(*dest);
+ const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+ const int16x8_t c = vminq_s16(b, max);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a = vld1q_u16(*dest);
+ const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+ const uint16x8_t c = vqshluq_n_s16(b, 0);
+ vst1q_u16(*dest, c);
+ *dest += stride;
+}
+
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const tran_low_t out0 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ const tran_low_t out1 = HIGHBD_WRAPLOW(
+ dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
+ const int16x8_t dc = vdupq_n_s16(a1);
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ } else {
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ }
+}
+
+static INLINE void idct8x8_12_half1d_bd10(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x4_t step1[8], step2[8];
+
+ transpose_s32_4x4(io0, io1, io2, io3);
+
+ // stage 1
+ step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+ step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+ step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+ step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
+
+ // stage 2
+ step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+ step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+ step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[1], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[1], step2[3]);
+
+ step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+ step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_half1d_bd12(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x2_t input1l, input1h, input3l, input3h;
+ int32x2_t step1l[2], step1h[2];
+ int32x4_t step1[8], step2[8];
+ int64x2_t t64[8];
+ int32x2_t t32[8];
+
+ transpose_s32_4x4(io0, io1, io2, io3);
+
+ // stage 1
+ input1l = vget_low_s32(*io1);
+ input1h = vget_high_s32(*io1);
+ input3l = vget_low_s32(*io3);
+ input3h = vget_high_s32(*io3);
+ step1l[0] = vget_low_s32(*io0);
+ step1h[0] = vget_high_s32(*io0);
+ step1l[1] = vget_low_s32(*io2);
+ step1h[1] = vget_high_s32(*io2);
+
+ t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+ t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+ t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+ t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+ t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+ t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+ t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+ t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+ step1[4] = vcombine_s32(t32[0], t32[1]);
+ step1[5] = vcombine_s32(t32[2], t32[3]);
+ step1[6] = vcombine_s32(t32[4], t32[5]);
+ step1[7] = vcombine_s32(t32[6], t32[7]);
+
+ // stage 2
+ t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+ t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+ t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+ t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+ t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+ step2[1] = vcombine_s32(t32[2], t32[3]);
+ step2[2] = vcombine_s32(t32[4], t32[5]);
+ step2[3] = vcombine_s32(t32[6], t32[7]);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[1], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[1], step2[3]);
+
+ t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[0] =
+ vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t64[2] =
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ step1[5] = vcombine_s32(t32[0], t32[1]);
+ step1[6] = vcombine_s32(t32[2], t32[3]);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int32x4_t a[16];
+ int16x8_t c[8];
+
+ a[0] = vld1q_s32(input);
+ a[1] = vld1q_s32(input + 8);
+ a[2] = vld1q_s32(input + 16);
+ a[3] = vld1q_s32(input + 24);
+
+ if (bd == 8) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24
+ const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28
+ int16x4_t b[8];
+
+ b[0] = vmovn_s32(a[0]);
+ b[1] = vmovn_s32(a[1]);
+ b[2] = vmovn_s32(a[2]);
+ b[3] = vmovn_s32(a[3]);
+
+ idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b);
+ idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c);
+ c[0] = vrshrq_n_s16(c[0], 5);
+ c[1] = vrshrq_n_s16(c[1], 5);
+ c[2] = vrshrq_n_s16(c[2], 5);
+ c[3] = vrshrq_n_s16(c[3], 5);
+ c[4] = vrshrq_n_s16(c[4], 5);
+ c[5] = vrshrq_n_s16(c[5], 5);
+ c[6] = vrshrq_n_s16(c[6], 5);
+ c[7] = vrshrq_n_s16(c[7], 5);
+ } else {
+ const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24
+ const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
+
+ if (bd == 10) {
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[8], &a[9], &a[10], &a[11]);
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+ &a[12], &a[13], &a[14], &a[15]);
+ } else {
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[8], &a[9], &a[10], &a[11]);
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+ &a[12], &a[13], &a[14], &a[15]);
+ }
+ c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+ c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+ c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+ c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+ c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+ c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+ c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+ c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
+ }
+ highbd_add8x8(c, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int32x4_t a[16];
+ int16x8_t c[8];
+
+ a[0] = vld1q_s32(input);
+ a[1] = vld1q_s32(input + 4);
+ a[2] = vld1q_s32(input + 8);
+ a[3] = vld1q_s32(input + 12);
+ a[4] = vld1q_s32(input + 16);
+ a[5] = vld1q_s32(input + 20);
+ a[6] = vld1q_s32(input + 24);
+ a[7] = vld1q_s32(input + 28);
+ a[8] = vld1q_s32(input + 32);
+ a[9] = vld1q_s32(input + 36);
+ a[10] = vld1q_s32(input + 40);
+ a[11] = vld1q_s32(input + 44);
+ a[12] = vld1q_s32(input + 48);
+ a[13] = vld1q_s32(input + 52);
+ a[14] = vld1q_s32(input + 56);
+ a[15] = vld1q_s32(input + 60);
+
+ if (bd == 8) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t b[8];
+
+ b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
+ b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
+ b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
+ b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
+ b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
+ b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
+ b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
+ b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));
+
+ idct8x8_64_1d_bd8(cospis0, cospis1, b);
+ idct8x8_64_1d_bd8(cospis0, cospis1, b);
+
+ c[0] = vrshrq_n_s16(b[0], 5);
+ c[1] = vrshrq_n_s16(b[1], 5);
+ c[2] = vrshrq_n_s16(b[2], 5);
+ c[3] = vrshrq_n_s16(b[3], 5);
+ c[4] = vrshrq_n_s16(b[4], 5);
+ c[5] = vrshrq_n_s16(b[5], 5);
+ c[6] = vrshrq_n_s16(b[6], 5);
+ c[7] = vrshrq_n_s16(b[7], 5);
+ } else {
+ const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24
+ const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
+
+ if (bd == 10) {
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+ &a[12], &a[13], &a[14], &a[15]);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+ &a[2], &a[10], &a[3], &a[11]);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+ &a[6], &a[14], &a[7], &a[15]);
+ } else {
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+ &a[12], &a[13], &a[14], &a[15]);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+ &a[2], &a[10], &a[3], &a[11]);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+ &a[6], &a[14], &a[7], &a[15]);
+ }
+ c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+ c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+ c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+ c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+ c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+ c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+ c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+ c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
+ }
+ highbd_add8x8(c, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
new file mode 100644
index 0000000000..518ef4336e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x4_t a0 = vld1_u16(*dest);
+ const uint16x4_t a1 = vld1_u16(*dest + stride);
+ const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
+ // Note: In some profile tests, res is quite close to +/-32767.
+ // We use saturating addition.
+ const int16x8_t b = vqaddq_s16(res, a);
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1_u16(*dest, vget_low_u16(d));
+ *dest += stride;
+ vst1_u16(*dest, vget_high_u16(d));
+ *dest += stride;
+}
+
+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
+ int32x4_t *const a) {
+ int32x4_t b0, b1, b2, b3;
+
+ transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+ b0 = vaddq_s32(a[0], a[2]);
+ b1 = vsubq_s32(a[0], a[2]);
+ b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
+ b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
+ b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1);
+ b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1);
+ b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1);
+ b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1);
+ b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+ b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+ b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+ b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
+ a[0] = vaddq_s32(b0, b3);
+ a[1] = vaddq_s32(b1, b2);
+ a[2] = vsubq_s32(b1, b2);
+ a[3] = vsubq_s32(b0, b3);
+}
+
+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
+ int32x4_t *const a) {
+ int32x4_t b0, b1, b2, b3;
+ int64x2_t c[12];
+
+ transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+ b0 = vaddq_s32(a[0], a[2]);
+ b1 = vsubq_s32(a[0], a[2]);
+ c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
+ c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
+ c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
+ c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
+ c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1);
+ c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1);
+ c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1);
+ c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1);
+ c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1);
+ c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1);
+ c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1);
+ c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1);
+ c[4] = vsubq_s64(c[4], c[8]);
+ c[5] = vsubq_s64(c[5], c[9]);
+ c[6] = vaddq_s64(c[6], c[10]);
+ c[7] = vaddq_s64(c[7], c[11]);
+ b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
+ vrshrn_n_s64(c[1], DCT_CONST_BITS));
+ b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
+ vrshrn_n_s64(c[3], DCT_CONST_BITS));
+ b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS),
+ vrshrn_n_s64(c[5], DCT_CONST_BITS));
+ b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS),
+ vrshrn_n_s64(c[7], DCT_CONST_BITS));
+ a[0] = vaddq_s32(b0, b3);
+ a[1] = vaddq_s32(b1, b2);
+ a[2] = vsubq_s32(b1, b2);
+ a[3] = vsubq_s32(b0, b3);
+}
+
+static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest,
+ const int stride, const int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const uint16_t *dst = dest;
+ uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
+ int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
+
+ d0 = vld1q_u16(dst);
+ dst += stride;
+ d1 = vld1q_u16(dst);
+ dst += stride;
+ d2 = vld1q_u16(dst);
+ dst += stride;
+ d3 = vld1q_u16(dst);
+ dst += stride;
+ d4 = vld1q_u16(dst);
+ dst += stride;
+ d5 = vld1q_u16(dst);
+ dst += stride;
+ d6 = vld1q_u16(dst);
+ dst += stride;
+ d7 = vld1q_u16(dst);
+
+ d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0));
+ d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1));
+ d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2));
+ d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3));
+ d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4));
+ d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5));
+ d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6));
+ d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7));
+
+ d0_s16 = vminq_s16(d0_s16, max);
+ d1_s16 = vminq_s16(d1_s16, max);
+ d2_s16 = vminq_s16(d2_s16, max);
+ d3_s16 = vminq_s16(d3_s16, max);
+ d4_s16 = vminq_s16(d4_s16, max);
+ d5_s16 = vminq_s16(d5_s16, max);
+ d6_s16 = vminq_s16(d6_s16, max);
+ d7_s16 = vminq_s16(d7_s16, max);
+ d0_u16 = vqshluq_n_s16(d0_s16, 0);
+ d1_u16 = vqshluq_n_s16(d1_s16, 0);
+ d2_u16 = vqshluq_n_s16(d2_s16, 0);
+ d3_u16 = vqshluq_n_s16(d3_s16, 0);
+ d4_u16 = vqshluq_n_s16(d4_s16, 0);
+ d5_u16 = vqshluq_n_s16(d5_s16, 0);
+ d6_u16 = vqshluq_n_s16(d6_s16, 0);
+ d7_u16 = vqshluq_n_s16(d7_s16, 0);
+
+ vst1q_u16(dest, d0_u16);
+ dest += stride;
+ vst1q_u16(dest, d1_u16);
+ dest += stride;
+ vst1q_u16(dest, d2_u16);
+ dest += stride;
+ vst1q_u16(dest, d3_u16);
+ dest += stride;
+ vst1q_u16(dest, d4_u16);
+ dest += stride;
+ vst1q_u16(dest, d5_u16);
+ dest += stride;
+ vst1q_u16(dest, d6_u16);
+ dest += stride;
+ vst1q_u16(dest, d7_u16);
+}
+
+static INLINE void idct8x8_64_half1d_bd10(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x4_t step1[8], step2[8];
+
+ transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+ // stage 1
+ step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+ step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+ step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+ step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+
+ step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
+ step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
+ step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
+ step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
+
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
+
+ // stage 2
+ step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+ step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+ step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+
+ step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+ step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+ step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
+ step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
+
+ step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[0], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[0], step2[3]);
+
+ step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+ step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_half1d_bd12(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
+ input7h;
+ int32x2_t step1l[4], step1h[4];
+ int32x4_t step1[8], step2[8];
+ int64x2_t t64[8];
+ int32x2_t t32[8];
+
+ transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+ // stage 1
+ input1l = vget_low_s32(*io1);
+ input1h = vget_high_s32(*io1);
+ input3l = vget_low_s32(*io3);
+ input3h = vget_high_s32(*io3);
+ input5l = vget_low_s32(*io5);
+ input5h = vget_high_s32(*io5);
+ input7l = vget_low_s32(*io7);
+ input7h = vget_high_s32(*io7);
+ step1l[0] = vget_low_s32(*io0);
+ step1h[0] = vget_high_s32(*io0);
+ step1l[1] = vget_low_s32(*io2);
+ step1h[1] = vget_high_s32(*io2);
+ step1l[2] = vget_low_s32(*io4);
+ step1h[2] = vget_high_s32(*io4);
+ step1l[3] = vget_low_s32(*io6);
+ step1h[3] = vget_high_s32(*io6);
+
+ t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+ t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+ t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+ t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+ t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+ t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+ t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+ t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+ t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0);
+ t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0);
+ t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1);
+ t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1);
+ t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0);
+ t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0);
+ t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1);
+ t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+ step1[4] = vcombine_s32(t32[0], t32[1]);
+ step1[5] = vcombine_s32(t32[2], t32[3]);
+ step1[6] = vcombine_s32(t32[4], t32[5]);
+ step1[7] = vcombine_s32(t32[6], t32[7]);
+
+ // stage 2
+ t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+ t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+ t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+ t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+ t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+ t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+ t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+ t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+ t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+ t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
+ t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
+ t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
+ t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+ step2[0] = vcombine_s32(t32[0], t32[1]);
+ step2[1] = vcombine_s32(t32[2], t32[3]);
+ step2[2] = vcombine_s32(t32[4], t32[5]);
+ step2[3] = vcombine_s32(t32[6], t32[7]);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[0], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[0], step2[3]);
+
+ t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[0] =
+ vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t64[2] =
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ step1[5] = vcombine_s32(t32[0], t32[1]);
+ step1[6] = vcombine_s32(t32[2], t32[3]);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
+ int32_t *output) {
+ // Save the result into output
+ vst1q_s32(output + 0, out[0].val[0]);
+ vst1q_s32(output + 4, out[0].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[1].val[0]);
+ vst1q_s32(output + 4, out[1].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[2].val[0]);
+ vst1q_s32(output + 4, out[2].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[3].val[0]);
+ vst1q_s32(output + 4, out[3].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[4].val[0]);
+ vst1q_s32(output + 4, out[4].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[5].val[0]);
+ vst1q_s32(output + 4, out[5].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[6].val[0]);
+ vst1q_s32(output + 4, out[6].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[7].val[0]);
+ vst1q_s32(output + 4, out[7].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[8].val[0]);
+ vst1q_s32(output + 4, out[8].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[9].val[0]);
+ vst1q_s32(output + 4, out[9].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[10].val[0]);
+ vst1q_s32(output + 4, out[10].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[11].val[0]);
+ vst1q_s32(output + 4, out[11].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[12].val[0]);
+ vst1q_s32(output + 4, out[12].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[13].val[0]);
+ vst1q_s32(output + 4, out[13].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[14].val[0]);
+ vst1q_s32(output + 4, out[14].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[15].val[0]);
+ vst1q_s32(output + 4, out[15].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
+ uint16_t *dest, const int stride,
+ const int bd) {
+ // Add the result to dest
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ int16x8_t o[16];
+ o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
+ vrshrn_n_s32(out[0].val[1], 6));
+ o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
+ vrshrn_n_s32(out[1].val[1], 6));
+ o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
+ vrshrn_n_s32(out[2].val[1], 6));
+ o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
+ vrshrn_n_s32(out[3].val[1], 6));
+ o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
+ vrshrn_n_s32(out[4].val[1], 6));
+ o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
+ vrshrn_n_s32(out[5].val[1], 6));
+ o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
+ vrshrn_n_s32(out[6].val[1], 6));
+ o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
+ vrshrn_n_s32(out[7].val[1], 6));
+ o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
+ vrshrn_n_s32(out[8].val[1], 6));
+ o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
+ vrshrn_n_s32(out[9].val[1], 6));
+ o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
+ vrshrn_n_s32(out[10].val[1], 6));
+ o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
+ vrshrn_n_s32(out[11].val[1], 6));
+ o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
+ vrshrn_n_s32(out[12].val[1], 6));
+ o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
+ vrshrn_n_s32(out[13].val[1], 6));
+ o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
+ vrshrn_n_s32(out[14].val[1], 6));
+ o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
+ vrshrn_n_s32(out[15].val[1], 6));
+ highbd_idct16x16_add8x1(o[0], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[1], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[2], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[3], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[4], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[5], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[6], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[7], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[8], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[9], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[10], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[11], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[12], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[13], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[14], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[15], max, &dest, stride);
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd);
+
+#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..235cb5b996
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,2514 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "sum_neon.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
+ const uint16x4_t ref_u16 = vld1_u16(ref);
+ return horizontal_add_uint16x4(ref_u16);
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_u16(dst, dc);
+ }
+}
+
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t a = vld1_u16(above);
+ const uint16x4_t l = vld1_u16(left);
+ const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
+ const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
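
The vrshr_n_u16(vdup_n_u16(sum), 3) above is a round-to-nearest average of
the eight neighbors, i.e. (sum + 4) >> 3. A scalar sketch of the same
predictor (the sketch_ name is ours):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the 4x4 DC predictor: average the four above and four
 * left neighbors with rounding, then fill the block with that value. */
void sketch_highbd_dc_4x4(uint16_t *dst, ptrdiff_t stride,
                          const uint16_t *above, const uint16_t *left) {
  uint32_t sum = 0;
  uint16_t dc;
  int r, c;
  for (r = 0; r < 4; ++r) sum += above[r] + left[r];
  dc = (uint16_t)((sum + 4) >> 3); /* matches vrshr_n_u16(..., 3) */
  for (r = 0; r < 4; ++r, dst += stride) {
    for (c = 0; c < 4; ++c) dst[c] = dc;
  }
}
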
+
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_4(left);
+ const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_4(above);
+ const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
+ const uint16x8_t ref_u16 = vld1q_u16(ref);
+ return horizontal_add_uint16x8(ref_u16);
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16x8_t dc) {
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1q_u16(dst, dc);
+ }
+}
+
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t above_u16 = vld1q_u16(above);
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
+ const uint16_t sum = horizontal_add_uint16x8(p0);
+ const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_8(left);
+ const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_8(above);
+ const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
+ const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0);
+ const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8);
+ const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1);
+ return horizontal_add_uint16x8(p0);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const uint16x8_t dc) {
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst1q_u16(dst + 0, dc);
+ vst1q_u16(dst + 8, dc);
+ }
+}
+
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t a0 = vld1q_u16(above + 0);
+ const uint16x8_t a1 = vld1q_u16(above + 8);
+ const uint16x8_t l0 = vld1q_u16(left + 0);
+ const uint16x8_t l1 = vld1q_u16(left + 8);
+ const uint16x8_t pa = vaddq_u16(a0, a1);
+ const uint16x8_t pl = vaddq_u16(l0, l1);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ const uint32_t sum = horizontal_add_uint16x8(pal0);
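+  // At 12-bit depth the 32 neighbors can sum past 16 bits, so keep the sum
+  // in 32 bits and form the rounded average with a rounding narrow:
+  // (sum + 16) >> 5.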
+ const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_16(left);
+ const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t sum = dc_sum_16(above);
+ const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
+ const uint16x8_t r0 = vld1q_u16(ref + 0);
+ const uint16x8_t r1 = vld1q_u16(ref + 8);
+ const uint16x8_t r2 = vld1q_u16(ref + 16);
+ const uint16x8_t r3 = vld1q_u16(ref + 24);
+ const uint16x8_t p0 = vaddq_u16(r0, r1);
+ const uint16x8_t p1 = vaddq_u16(r2, r3);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ return horizontal_add_uint16x8(p2);
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const uint16x8_t dc) {
+ int i;
+ for (i = 0; i < 32; ++i) {
+ vst1q_u16(dst + 0, dc);
+ vst1q_u16(dst + 8, dc);
+ vst1q_u16(dst + 16, dc);
+ vst1q_u16(dst + 24, dc);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t a0 = vld1q_u16(above + 0);
+ const uint16x8_t a1 = vld1q_u16(above + 8);
+ const uint16x8_t a2 = vld1q_u16(above + 16);
+ const uint16x8_t a3 = vld1q_u16(above + 24);
+ const uint16x8_t l0 = vld1q_u16(left + 0);
+ const uint16x8_t l1 = vld1q_u16(left + 8);
+ const uint16x8_t l2 = vld1q_u16(left + 16);
+ const uint16x8_t l3 = vld1q_u16(left + 24);
+ const uint16x8_t pa0 = vaddq_u16(a0, a1);
+ const uint16x8_t pa1 = vaddq_u16(a2, a3);
+ const uint16x8_t pl0 = vaddq_u16(l0, l1);
+ const uint16x8_t pl1 = vaddq_u16(l2, l3);
+ const uint16x8_t pa = vaddq_u16(pa0, pa1);
+ const uint16x8_t pl = vaddq_u16(pl0, pl1);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ const uint32_t sum = horizontal_add_uint16x8(pal0);
+ const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32_t sum = dc_sum_32(left);
+ const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32_t sum = dc_sum_32(above);
+ const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t a0, a1, a2, d0;
+ uint16_t a7;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above);
+ a7 = above[7];
+
+ // [ above[1], ..., above[6], x, x ]
+ a1 = vextq_u16(a0, a0, 1);
+ // [ above[2], ..., above[7], x, x ]
+ a2 = vextq_u16(a0, a0, 2);
+
+ // d0[0] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[5] = AVG3(above[5], above[6], above[7]);
+ // d0[6] = x (don't care)
+ // d0[7] = x (don't care)
+ d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+ // We want:
+ // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
+ // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
+ // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
+  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
+ vst1_u16(dst + 0 * stride, vget_low_u16(d0));
+ vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
+ vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
+ vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));
+
+  // The last store wrote the don't-care lane d0[6], so fix it up to
+  // above[7].
+ dst[3 * stride + 3] = a7;
+}
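
AVG3(a, b, c) here is (a + 2 * b + c + 2) >> 2, and the
vrhaddq_u16(vhaddq_u16(a0, a2), a1) pairing computes it exactly without
widening: vhaddq truncates (a + c) >> 1, and the +1 rounding inside vrhaddq
absorbs the truncated bit. A small scalar check of that identity (names are
ours):

#include <assert.h>
#include <stdint.h>

/* AVG3 as written in the scalar predictor code. */
uint16_t avg3(uint32_t a, uint32_t b, uint32_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

/* The NEON formulation: rounding halving add of a truncating halving add. */
uint16_t avg3_neon_style(uint32_t a, uint32_t b, uint32_t c) {
  const uint32_t hadd = (a + c) >> 1;     /* vhaddq_u16 */
  return (uint16_t)((hadd + b + 1) >> 1); /* vrhaddq_u16 */
}

int main(void) {
  /* Writing a + c = 2k + r with r in {0,1}:
   * (2 * (k + b + 1) + r) >> 2 == (k + b + 1) >> 1, so the forms agree. */
  uint32_t a, b, c;
  for (a = 0; a < 64; ++a) {
    for (b = 0; b < 64; ++b) {
      for (c = 0; c < 64; ++c)
        assert(avg3(a, b, c) == avg3_neon_style(a, b, c));
    }
  }
  return 0;
}
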
+
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t ax0, a0, a1, a7, d0;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a7 = vld1q_dup_u16(above + 7);
+
+  // We want the AVG3 result in lanes 1-7 inclusive so that above[7] can be
+  // shifted in later; shift a0 across by one to get the right inputs:
+ // [ x, above[0], ... , above[6] ]
+ ax0 = vextq_u16(a0, a0, 7);
+
+ // d0[0] = x (don't care)
+ // d0[1] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[7] = AVG3(above[6], above[7], above[8]);
+ d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+
+  // Undo the earlier ext and incrementally shift in duplicates of above[7].
+ vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
+ vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
+ vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
+ vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
+ vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
+ vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
+ vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
+ vst1q_u16(dst + 7 * stride, a7);
+}
+
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+ a9 = vld1q_u16(above + 9);
+ a15 = vld1q_dup_u16(above + 15);
+
+ // [ x, above[0], ... , above[6] ]
+ ax0 = vextq_u16(a0, a0, 7);
+
+  // We leave one lane unused here so there is room to shift above[15] into
+  // the last lane:
+  // d0[0][0] = x (don't care)
+  // d0[0][1] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[0][7] = AVG3(above[6], above[7], above[8]);
+ // d0[1][0] = AVG3(above[7], above[8], above[9]);
+ // ...
+ // d0[1][7] = AVG3(above[14], above[15], above[16]);
+ d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+ d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+
+ // Incrementally shift in duplicates of above[15].
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+ vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
+ vst1q_u16(dst + 7 * stride + 0, d0[1]);
+ vst1q_u16(dst + 7 * stride + 8, a15);
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
+ vst1q_u16(dst + 8 * stride + 8, a15);
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
+ vst1q_u16(dst + 9 * stride + 8, a15);
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
+ vst1q_u16(dst + 10 * stride + 8, a15);
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
+ vst1q_u16(dst + 11 * stride + 8, a15);
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
+ vst1q_u16(dst + 12 * stride + 8, a15);
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
+ vst1q_u16(dst + 13 * stride + 8, a15);
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
+ vst1q_u16(dst + 14 * stride + 8, a15);
+ vst1q_u16(dst + 15 * stride + 0, a15);
+ vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
+ int i;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+ a9 = vld1q_u16(above + 9);
+ a15 = vld1q_u16(above + 15);
+ a16 = vld1q_u16(above + 16);
+ a17 = vld1q_u16(above + 17);
+ a23 = vld1q_u16(above + 23);
+ a24 = vld1q_u16(above + 24);
+ a25 = vld1q_u16(above + 25);
+ a31 = vld1q_dup_u16(above + 31);
+
+ // [ x, above[0], ... , above[6] ]
+ ax0 = vextq_u16(a0, a0, 7);
+
+ d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+ d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+ d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
+ d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);
+
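+  // d0 holds the AVG3 results one lane late (lane 0 of d0[0] is a
+  // don't-care), so shift everything across by one before each store;
+  // duplicates of above[31] fill in from the right.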
+ for (i = 0; i < 32; ++i) {
+ d0[0] = vextq_u16(d0[0], d0[1], 1);
+ d0[1] = vextq_u16(d0[1], d0[2], 1);
+ d0[2] = vextq_u16(d0[2], d0[3], 1);
+ d0[3] = vextq_u16(d0[3], a31, 1);
+ vst1q_u16(dst + 0, d0[0]);
+ vst1q_u16(dst + 8, d0[1]);
+ vst1q_u16(dst + 16, d0[2]);
+ vst1q_u16(dst + 24, d0[3]);
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1_u16(above + 0);
+ a1 = vld1_u16(above + 1);
+ a2 = vld1_u16(above + 2);
+ a3 = vld1_u16(above + 3);
+
+ d0 = vrhadd_u16(a0, a1);
+ d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
+ d2 = vrhadd_u16(a1, a2);
+ d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);
+
+  // Note that we compute a full average for the final elements here rather
+  // than storing a duplicate of above[3]. This differs (correctly) from the
+  // general scheme employed by the bs={8,16,32} implementations in order to
+  // match the original C implementation.
+ vst1_u16(dst + 0 * stride, d0);
+ vst1_u16(dst + 1 * stride, d1);
+ vst1_u16(dst + 2 * stride, d2);
+ vst1_u16(dst + 3 * stride, d3);
+}
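
The d63 predictor builds each pair of rows from a two-tap average (AVG2,
vrhadd) and a three-tap average (AVG3) of the above row, sliding the window
one sample to the right every two rows. A scalar sketch of the 4x4 case
(helper names are ours; note it reads up to above[6], as the NEON loads do):

#include <stddef.h>
#include <stdint.h>

uint16_t avg2_(uint32_t a, uint32_t b) {
  return (uint16_t)((a + b + 1) >> 1);
}

uint16_t avg3_(uint32_t a, uint32_t b, uint32_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

/* Scalar sketch of the 4x4 d63 predictor: row r, column c reads the above
 * row starting at index (r >> 1) + c; even rows use AVG2, odd rows AVG3. */
void sketch_highbd_d63_4x4(uint16_t *dst, ptrdiff_t stride,
                           const uint16_t *above) {
  int r, c;
  for (r = 0; r < 4; ++r, dst += stride) {
    for (c = 0; c < 4; ++c) {
      const int i = (r >> 1) + c;
      dst[c] = (r & 1) ? avg3_(above[i], above[i + 1], above[i + 2])
                       : avg2_(above[i], above[i + 1]);
    }
  }
}
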
+
+void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a2 = vld1q_u16(above + 2);
+ a7 = vld1q_dup_u16(above + 7);
+
+ d0 = vrhaddq_u16(a0, a1);
+ d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+ // We want to store:
+ // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+ // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+ // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ]
+ // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ]
+ // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ]
+ // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ]
+ // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ]
+ // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ]
+ // Note in particular that d0[7] and d1[7] are only ever referenced in the
+ // stride=0 and stride=1 cases respectively, and in later strides are
+ // replaced by a copy of above[7]. These are equivalent if for i>7,
+ // above[i]==above[7], however that is not always the case.
+
+  // Strip out d0[7] and d1[7] so that we can replace them with an
+  // additional copy of above[7]. The first vector passed to vextq here
+  // doesn't matter, so just reuse d0/d1.
+ d0_ext = vextq_u16(d0, d0, 7);
+ d1_ext = vextq_u16(d1, d1, 7);
+
+ // Shuffle in duplicates of above[7] and store.
+ vst1q_u16(dst + 0 * stride, d0);
+ vst1q_u16(dst + 1 * stride, d1);
+ vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
+ vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
+ vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
+ vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
+ vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
+ vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
+}
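+// A scalar sketch of the scheme above (hedged; the loop form is illustrative,
+// the real code is fully unrolled), matching the memcpy/memset structure of
+// the C reference:
+//
+//   for (r = 0; r < 8; ++r) {
+//     const uint16_t *row = (r & 1) ? d1 : d0;  // d0/d1 as 8-entry arrays
+//     for (c = 0; c < 8; ++c) {
+//       int i = (r >> 1) + c;
+//       // d0[7]/d1[7] survive only in the first two rows; beyond that any
+//       // index >= 7 becomes a copy of above[7].
+//       dst[r * stride + c] = (i < 7 || r < 2) ? row[i] : above[7];
+//     }
+//   }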
+
+void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+ uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a2 = vld1q_u16(above + 2);
+ a8 = vld1q_u16(above + 8);
+ a9 = vld1q_u16(above + 9);
+ a10 = vld1q_u16(above + 10);
+ a15 = vld1q_dup_u16(above + 15);
+
+ d0[0] = vrhaddq_u16(a0, a1);
+ d0[1] = vrhaddq_u16(a8, a9);
+ d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[15]. The first vector here doesn't matter, so
+  // just reuse the same vector.
+ d0_ext = vextq_u16(d0[1], d0[1], 7);
+ d1_ext = vextq_u16(d1[1], d1[1], 7);
+
+  // Shuffle in duplicates of above[15] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting-out of the final
+  // element performed above.
+ vst1q_u16(dst + 0 * stride + 0, d0[0]);
+ vst1q_u16(dst + 0 * stride + 8, d0[1]);
+ vst1q_u16(dst + 1 * stride + 0, d1[0]);
+ vst1q_u16(dst + 1 * stride + 8, d1[1]);
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+ vst1q_u16(dst + 14 * stride + 8, a15);
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+ uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
+ d1[4], d0_ext, d1_ext;
+ (void)left;
+ (void)bd;
+
+ a0 = vld1q_u16(above + 0);
+ a1 = vld1q_u16(above + 1);
+ a2 = vld1q_u16(above + 2);
+ a8 = vld1q_u16(above + 8);
+ a9 = vld1q_u16(above + 9);
+ a10 = vld1q_u16(above + 10);
+ a16 = vld1q_u16(above + 16);
+ a17 = vld1q_u16(above + 17);
+ a18 = vld1q_u16(above + 18);
+ a24 = vld1q_u16(above + 24);
+ a25 = vld1q_u16(above + 25);
+ a26 = vld1q_u16(above + 26);
+ a31 = vld1q_dup_u16(above + 31);
+
+ d0[0] = vrhaddq_u16(a0, a1);
+ d0[1] = vrhaddq_u16(a8, a9);
+ d0[2] = vrhaddq_u16(a16, a17);
+ d0[3] = vrhaddq_u16(a24, a25);
+ d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+ d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
+ d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[31]. The first vector here doesn't matter, so
+  // just reuse the same vector.
+ d0_ext = vextq_u16(d0[3], d0[3], 7);
+ d1_ext = vextq_u16(d1[3], d1[3], 7);
+
+  // Shuffle in duplicates of above[31] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting-out of the final
+  // element performed above.
+
+ vst1q_u16(dst + 0 * stride + 0, d0[0]);
+ vst1q_u16(dst + 0 * stride + 8, d0[1]);
+ vst1q_u16(dst + 0 * stride + 16, d0[2]);
+ vst1q_u16(dst + 0 * stride + 24, d0[3]);
+ vst1q_u16(dst + 1 * stride + 0, d1[0]);
+ vst1q_u16(dst + 1 * stride + 8, d1[1]);
+ vst1q_u16(dst + 1 * stride + 16, d1[2]);
+ vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
+ vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
+ vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));
+
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
+ vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
+ vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));
+
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
+ vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
+ vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));
+
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
+ vst1q_u16(dst + 14 * stride + 24, a31);
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
+ vst1q_u16(dst + 15 * stride + 24, a31);
+
+ vst1q_u16(dst + 16 * stride + 0, d0[1]);
+ vst1q_u16(dst + 16 * stride + 8, d0[2]);
+ vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
+ vst1q_u16(dst + 16 * stride + 24, a31);
+ vst1q_u16(dst + 17 * stride + 0, d1[1]);
+ vst1q_u16(dst + 17 * stride + 8, d1[2]);
+ vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
+ vst1q_u16(dst + 17 * stride + 24, a31);
+
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
+ vst1q_u16(dst + 18 * stride + 24, a31);
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
+ vst1q_u16(dst + 19 * stride + 24, a31);
+
+ vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
+ vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
+ vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
+ vst1q_u16(dst + 20 * stride + 24, a31);
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
+ vst1q_u16(dst + 21 * stride + 24, a31);
+
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
+ vst1q_u16(dst + 22 * stride + 24, a31);
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
+ vst1q_u16(dst + 23 * stride + 24, a31);
+
+ vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
+ vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
+ vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
+ vst1q_u16(dst + 24 * stride + 24, a31);
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
+ vst1q_u16(dst + 25 * stride + 24, a31);
+
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
+ vst1q_u16(dst + 26 * stride + 24, a31);
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
+ vst1q_u16(dst + 27 * stride + 24, a31);
+
+ vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
+ vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
+ vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
+ vst1q_u16(dst + 28 * stride + 24, a31);
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
+ vst1q_u16(dst + 29 * stride + 24, a31);
+
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
+ vst1q_u16(dst + 30 * stride + 16, a31);
+ vst1q_u16(dst + 30 * stride + 24, a31);
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
+ vst1q_u16(dst + 31 * stride + 16, a31);
+ vst1q_u16(dst + 31 * stride + 24, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+ (void)bd;
+
+ az = vld1_u16(above - 1);
+ a0 = vld1_u16(above + 0);
+ // [ left[0], above[-1], above[0], above[1] ]
+ l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+ l0 = vld1_u16(left + 0);
+  // The last lane here is unused; reading left[4] could cause a buffer
+ // over-read, so just fill with a duplicate of left[0] to avoid needing to
+ // materialize a zero:
+ // [ left[1], left[2], left[3], x ]
+ l1 = vext_u16(l0, l0, 1);
+ // [ above[-1], left[0], left[1], left[2] ]
+ azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+ d0 = vrhadd_u16(az, a0);
+ d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+
+ col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+ col0_even = vdup_lane_u16(col0, 0);
+ col0_odd = vdup_lane_u16(col0, 1);
+
+ vst1_u16(dst + 0 * stride, d0);
+ vst1_u16(dst + 1 * stride, d1);
+ vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3));
+ vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3));
+}
+
+void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+  // [ left[0], above[-1], ..., above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+  // The last lane here is unused; reading left[8] could cause a buffer
+ // over-read, so just fill with a duplicate of left[0] to avoid needing to
+ // materialize a zero:
+ // [ left[1], ... , left[7], x ]
+ l1 = vextq_u16(l0, l0, 1);
+ // [ above[-1], left[0], ..., left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ // d0[0] = AVG2(above[-1], above[0])
+ // ...
+ // d0[7] = AVG2(above[6], above[7])
+ d0 = vrhaddq_u16(az, a0);
+
+ // d1[0] = AVG3(left[0], above[-1], above[0])
+ // d1[1] = AVG3(above[-1], above[0], above[1])
+ // ...
+ // d1[7] = AVG3(above[5], above[6], above[7])
+ d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+ // The ext instruction shifts elements in from the end of the vector rather
+ // than the start, so reverse the vector to put the elements to be shifted in
+ // at the end:
+ // col0[7] = AVG3(above[-1], left[0], left[1])
+ // col0[6] = AVG3(left[0], left[1], left[2])
+ // ...
+ // col0[0] = AVG3(left[6], left[7], left[8])
+ col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ col0 = vrev64q_u16(vextq_u16(col0, col0, 4));
+
+  // We don't care about the first parameter to this uzp since we only ever
+  // use the high three elements; we just pass col0 again since it is already
+  // available:
+ // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+ // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+ col0_even = vuzpq_u16(col0, col0).val[1];
+ col0_odd = vuzpq_u16(col0, col0).val[0];
+
+ // Incrementally shift more elements from col0 into d0/1:
+ // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+ // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+ // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
+ // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+ // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
+ // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
+ // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+ // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+ vst1q_u16(dst + 0 * stride, d0);
+ vst1q_u16(dst + 1 * stride, d1);
+ vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
+ vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
+ vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
+ vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
+ vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
+ vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
+}
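+// A scalar sketch of the d117 structure above (hedged; col0 is written here
+// in source order, before the NEON reversal): rows 0 and 1 are the AVG2 and
+// AVG3 of the above edge, and each later row is the row two above it shifted
+// right by one, with a new left-edge AVG3 element entering at column 0:
+//
+//   for (r = 2; r < 8; ++r) {
+//     dst[r * stride + 0] = col0[r - 2];  // AVG3 along the left edge
+//     for (c = 1; c < 8; ++c)
+//       dst[r * stride + c] = dst[(r - 2) * stride + c - 1];
+//   }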
+
+void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
+ col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ a6 = vld1q_u16(above + 6);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ..., above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l7 = vld1q_u16(left + 7);
+ l8 = vld1q_u16(left + 8);
+  // The last lane here is unused; reading left[16] could cause a buffer
+ // over-read, so just fill with a duplicate of left[8] to avoid needing to
+ // materialize a zero:
+ // [ left[9], ... , left[15], x ]
+ l9 = vextq_u16(l8, l8, 1);
+ // [ above[-1], left[0], ..., left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ d0_lo = vrhaddq_u16(az, a0);
+ d0_hi = vrhaddq_u16(a7, a8);
+ d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+ d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+
+ col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+ // Reverse within each vector, then swap the array indices in the uzp to
+ // complete the reversal across all 16 elements.
+ col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
+ col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
+ col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
+ col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];
+
+ vst1q_u16(dst + 0 * stride + 0, d0_lo);
+ vst1q_u16(dst + 0 * stride + 8, d0_hi);
+ vst1q_u16(dst + 1 * stride + 0, d1_lo);
+ vst1q_u16(dst + 1 * stride + 8, d1_hi);
+
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));
+
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));
+
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));
+
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
+}
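+// NEON has no single instruction that reverses all eight 16-bit lanes
+// (vrev64q only reverses within each 64-bit half), so the reversal above is
+// composed from two permutes; a sketch for one vector v = [ v0, ..., v7 ]:
+//
+//   vextq_u16(v, v, 4)            -> [ v4, v5, v6, v7, v0, v1, v2, v3 ]
+//   vrev64q_u16 of that           -> [ v7, v6, v5, v4, v3, v2, v1, v0 ]
+//
+// Passing (col0_hi, col0_lo) rather than (col0_lo, col0_hi) to vuzpq then
+// completes the reversal across the full 16-element left-edge diagonal
+// before it is split into even/odd element streams.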
+
+void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+ l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
+ col0_even[2], col0_odd[2];
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ a6 = vld1q_u16(above + 6);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+ a14 = vld1q_u16(above + 14);
+ a15 = vld1q_u16(above + 15);
+ a16 = vld1q_u16(above + 16);
+ a22 = vld1q_u16(above + 22);
+ a23 = vld1q_u16(above + 23);
+ a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ..., above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l7 = vld1q_u16(left + 7);
+ l8 = vld1q_u16(left + 8);
+ l9 = vld1q_u16(left + 9);
+ l15 = vld1q_u16(left + 15);
+ l16 = vld1q_u16(left + 16);
+ l17 = vld1q_u16(left + 17);
+ l23 = vld1q_u16(left + 23);
+ l24 = vld1q_u16(left + 24);
+  // The last lane here is unused; reading left[32] could cause a buffer
+ // over-read, so just fill with a duplicate of left[24] to avoid needing to
+ // materialize a zero:
+ // [ left[25], ... , left[31], x ]
+ l25 = vextq_u16(l24, l24, 1);
+ // [ above[-1], left[0], ..., left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ d0[0] = vrhaddq_u16(az, a0);
+ d0[1] = vrhaddq_u16(a7, a8);
+ d0[2] = vrhaddq_u16(a15, a16);
+ d0[3] = vrhaddq_u16(a23, a24);
+ d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+ d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+ d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+ col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+ col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+ col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+ // Reverse within each vector, then swap the array indices in both the uzp
+ // and the col0_{even,odd} assignment to complete the reversal across all
+  // 32 elements.
+ col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
+ col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
+ col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
+ col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));
+
+ col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
+ col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
+ col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
+ col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];
+
+ vst1q_u16(dst + 0 * stride + 0, d0[0]);
+ vst1q_u16(dst + 0 * stride + 8, d0[1]);
+ vst1q_u16(dst + 0 * stride + 16, d0[2]);
+ vst1q_u16(dst + 0 * stride + 24, d0[3]);
+ vst1q_u16(dst + 1 * stride + 0, d1[0]);
+ vst1q_u16(dst + 1 * stride + 8, d1[1]);
+ vst1q_u16(dst + 1 * stride + 16, d1[2]);
+ vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
+ vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
+ vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));
+
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
+ vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
+ vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));
+
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
+ vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
+ vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));
+
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
+ vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+ vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+ vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
+ vst1q_u16(dst + 16 * stride + 8, d0[0]);
+ vst1q_u16(dst + 16 * stride + 16, d0[1]);
+ vst1q_u16(dst + 16 * stride + 24, d0[2]);
+ vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
+ vst1q_u16(dst + 17 * stride + 8, d1[0]);
+ vst1q_u16(dst + 17 * stride + 16, d1[1]);
+ vst1q_u16(dst + 17 * stride + 24, d1[2]);
+
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
+ vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+
+ vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
+ vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
+ vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
+ vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
+ vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));
+
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
+ vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+
+ vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
+ vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
+ vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
+ vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
+ vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));
+
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
+ vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+
+ vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
+ vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
+ vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
+ vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
+ vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));
+
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
+ vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
+ vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
+ vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+ uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi;
+ (void)bd;
+
+ az = vld1_u16(above - 1);
+ a0 = vld1_u16(above + 0);
+ // [ left[0], above[-1], above[0], above[1] ]
+ l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+ l0 = vld1_u16(left);
+  // The last lane here is unused; reading left[4] could cause a buffer
+ // over-read, so just fill with a duplicate of left[0] to avoid needing to
+ // materialize a zero:
+ // [ left[1], left[2], left[3], x ]
+ l1 = vext_u16(l0, l0, 1);
+ // [ above[-1], left[0], left[1], left[2] ]
+ azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+ d0 = vrhadd_u16(azl0, l0);
+ d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+ d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+
+ d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0];
+ d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1];
+
+ // Incrementally shift more elements from d0/d2 reversed into d1:
+ // stride=0 [ d0[0], d1[0], d1[1], d1[2] ]
+ // stride=1 [ d0[1], d2[0], d0[0], d1[0] ]
+ // stride=2 [ d0[2], d2[1], d0[1], d2[0] ]
+ // stride=3 [ d0[3], d2[2], d0[2], d2[1] ]
+ vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3));
+ vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1));
+ vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3));
+ vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1));
+}
+
+void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
+ d20_hi;
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ // [ left[0], above[-1], ... , above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left);
+  // The last lane here is unused; reading left[8] could cause a buffer
+ // over-read, so just fill with a duplicate of left[0] to avoid needing to
+ // materialize a zero:
+ // [ left[1], ... , left[7], x ]
+ l1 = vextq_u16(l0, l0, 1);
+ // [ above[-1], left[0], ... , left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ // d0[0] = AVG2(above[-1], left[0])
+ // d0[1] = AVG2(left[0], left[1])
+ // ...
+ // d0[7] = AVG2(left[6], left[7])
+ d0 = vrhaddq_u16(azl0, l0);
+
+ // d1[0] = AVG3(left[0], above[-1], above[0])
+ // d1[1] = AVG3(above[-1], above[0], above[1])
+ // ...
+ // d1[7] = AVG3(above[5], above[6], above[7])
+ d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+ // d2[0] = AVG3(above[-1], left[0], left[1])
+ // d2[1] = AVG3(left[0], left[1], left[2])
+ // ...
+ // d2[7] = AVG3(left[6], left[7], left[8])
+ d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+
+ // The ext instruction shifts elements in from the end of the vector rather
+ // than the start, so reverse the vectors to put the elements to be shifted
+ // in at the end:
+ d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
+ d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));
+
+ d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
+ d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];
+
+ // Incrementally shift more elements from d0/d2 reversed into d1:
+ // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+ // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+ // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+ // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+ // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+ // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+ // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+ // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+ vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
+ vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
+ vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
+ vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
+ vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
+ vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
+ vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
+ vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
+}
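+// vzipq_u16 interleaves its operands lane by lane, which is what builds the
+// alternating d2/d0 stream consumed by the vextq stores above; a sketch of
+// the lane layout, with d2r/d0r the reversed vectors:
+//
+//   vzipq_u16(d2r, d0r).val[0] == [ d2r[0], d0r[0], d2r[1], d0r[1],
+//                                   d2r[2], d0r[2], d2r[3], d0r[3] ]
+//   vzipq_u16(d2r, d0r).val[1] == [ d2r[4], d0r[4], d2r[5], d0r[5],
+//                                   d2r[6], d0r[6], d2r[7], d0r[7] ]
+//
+// Each output row is then a single vextq_u16 splice of this stream into d1,
+// pulling in two more left-edge elements per row.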
+
+void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+ uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
+ d2[2], d20[4];
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ a6 = vld1q_u16(above + 6);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ... , above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l7 = vld1q_u16(left + 7);
+ l8 = vld1q_u16(left + 8);
+  // The last lane here is unused; reading left[16] could cause a buffer
+ // over-read, so just fill with a duplicate of left[8] to avoid needing to
+ // materialize a zero:
+ // [ left[9], ... , left[15], x ]
+ l9 = vextq_u16(l8, l8, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ d0[0] = vrhaddq_u16(azl0, l0);
+ d0[1] = vrhaddq_u16(l7, l8);
+ d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+ d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+ d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+ d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+ d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+ d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+
+ d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
+ d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
+ d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
+ d20[3] = vzipq_u16(d2[0], d0[0]).val[1];
+
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
+ vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+}
+
+void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+ uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+ l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ a6 = vld1q_u16(above + 6);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+ a14 = vld1q_u16(above + 14);
+ a15 = vld1q_u16(above + 15);
+ a16 = vld1q_u16(above + 16);
+ a22 = vld1q_u16(above + 22);
+ a23 = vld1q_u16(above + 23);
+ a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ... , above[5] ]
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l7 = vld1q_u16(left + 7);
+ l8 = vld1q_u16(left + 8);
+ l9 = vld1q_u16(left + 9);
+ l15 = vld1q_u16(left + 15);
+ l16 = vld1q_u16(left + 16);
+ l17 = vld1q_u16(left + 17);
+ l23 = vld1q_u16(left + 23);
+ l24 = vld1q_u16(left + 24);
+  // The last lane here is unused; reading left[32] could cause a buffer
+ // over-read, so just fill with a duplicate of left[24] to avoid needing to
+ // materialize a zero:
+ // [ left[25], ... , left[31], x ]
+ l25 = vextq_u16(l24, l24, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ d0[0] = vrhaddq_u16(azl0, l0);
+ d0[1] = vrhaddq_u16(l7, l8);
+ d0[2] = vrhaddq_u16(l15, l16);
+ d0[3] = vrhaddq_u16(l23, l24);
+
+ d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+ d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+ d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+ d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+ d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+ d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+ d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+ d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+ d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
+ d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
+ d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+ d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+ d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
+ d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
+
+ d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
+ d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
+ d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
+ d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
+ d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
+ d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
+ d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
+ d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
+
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
+
+ vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
+
+ vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
+
+ vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
+
+ vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+ vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+ vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123___ = vld1q_u16(above - 1);
+ const uint16x4_t L0123 = vld1_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(L0123);
+ const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
+ const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
+ const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
+ const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
+ const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
+ const uint16x4_t row_0 = vget_low_u16(avg2);
+ const uint16x4_t row_1 = vget_high_u16(avg2);
+ const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
+ const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
+ const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1_u16(dst, r0);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, row_0);
+}
+
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A1234567_ = vld1q_u16(above + 1);
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
+ const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1q_u16(dst, r0);
+ dst += stride;
+ vst1q_u16(dst, r1);
+ dst += stride;
+ vst1q_u16(dst, r2);
+ dst += stride;
+ vst1q_u16(dst, r3);
+ dst += stride;
+ vst1q_u16(dst, r4);
+ dst += stride;
+ vst1q_u16(dst, r5);
+ dst += stride;
+ vst1q_u16(dst, r6);
+ dst += stride;
+ vst1q_u16(dst, row_0);
+}
+
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row_0,
+ const uint16x8_t row_1) {
+ vst1q_u16(*dst, row_0);
+ *dst += 8;
+ vst1q_u16(*dst, row_1);
+ *dst += stride - 8;
+}
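+// Usage sketch for the helper above (row names hypothetical): after the two
+// 8-lane stores the pointer has advanced 8 lanes, so stepping by stride - 8
+// leaves it at column 0 of the next row.
+//
+//   uint16_t *p = dst;
+//   d135_store_16(&p, stride, row0_lo, row0_hi);  // row 0; p -> row 1
+//   d135_store_16(&p, stride, row1_lo, row1_hi);  // row 1; p -> row 2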
+
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x8_t L89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+ const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
+ const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
+ const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
+
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
+
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+ const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
+
+ const uint16x8_t A789abcde = vld1q_u16(above + 7);
+ const uint16x8_t A89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
+ const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+ const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
+
+ const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
+ const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
+ const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
+ const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+ const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+ const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+ const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+ const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+ const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+ const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+ const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+ const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+ const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+ const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+ const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+
+ d135_store_16(&dst, stride, r0_0, r0_1);
+ d135_store_16(&dst, stride, r1_0, r1_1);
+ d135_store_16(&dst, stride, r2_0, r2_1);
+ d135_store_16(&dst, stride, r3_0, r3_1);
+ d135_store_16(&dst, stride, r4_0, r4_1);
+ d135_store_16(&dst, stride, r5_0, r5_1);
+ d135_store_16(&dst, stride, r6_0, r6_1);
+ d135_store_16(&dst, stride, row_1, row_2);
+ d135_store_16(&dst, stride, r8_0, r0_0);
+ d135_store_16(&dst, stride, r9_0, r1_0);
+ d135_store_16(&dst, stride, ra_0, r2_0);
+ d135_store_16(&dst, stride, rb_0, r3_0);
+ d135_store_16(&dst, stride, rc_0, r4_0);
+ d135_store_16(&dst, stride, rd_0, r5_0);
+ d135_store_16(&dst, stride, re_0, r6_0);
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+}
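+// A sketch of the structure above (hedged): row_0..row_3 hold the 31 usable
+// elements of the d135 diagonal (the reversed left edge, then the corner,
+// then the above edge; the final lane is unused), and each output row is a
+// 16-wide window sliding one element back towards the left edge per row:
+//
+//   // diag[0..30] spans [ row_0 | row_1 | row_2 | row_3 ]
+//   for (r = 0; r < 16; ++r)
+//     memcpy(dst + r * stride, diag + 15 - r, 16 * sizeof(uint16_t));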
+
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t LL01234567 = vld1q_u16(left + 16);
+ const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
+ const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
+ const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+ const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+ const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+ const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
+ const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
+ const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
+ const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
+
+ const uint16x8_t LU01234567 = vld1q_u16(left);
+ const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+ const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+ const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+ const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+ const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
+ const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
+ const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
+ const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+ const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
+ const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
+ const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+ const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
+ const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
+ const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
+ const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
+
+ const uint16x8_t AL01234567 = vld1q_u16(above);
+ const uint16x8_t AL12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+ uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
+
+ const uint16x8_t AL789abcde = vld1q_u16(above + 7);
+ const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
+ const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+ uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
+
+ const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
+ const uint16x8_t AR01234567 = vld1q_u16(above + 16);
+ const uint16x8_t AR12345678 = vld1q_u16(above + 17);
+ const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+ uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
+
+ const uint16x8_t AR789abcde = vld1q_u16(above + 23);
+ const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
+ const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
+ const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+ uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
+ int i, j;
+ (void)bd;
+
+ dst += 31 * stride;
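+  // The block is written bottom row first: each inner iteration stores one
+  // 32-sample window of the diagonal, then shifts row_0..row_3 left by one
+  // sample while rotating row_4 so that its next sample is always in lane 0;
+  // once row_4 is consumed (every 8 rows) it is refilled from row_5..row_7.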
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 8; ++j) {
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst -= stride + 24;
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, row_4, 1);
+ row_4 = vextq_u16(row_4, row_4, 1);
+ }
+ row_4 = row_5;
+ row_5 = row_6;
+ row_6 = row_7;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
+ (void)above;
+ (void)bd;
+
+ l0 = vld1_u16(left + 0);
+ l3 = vld1_dup_u16(left + 3);
+
+ // [ left[1], left[2], left[3], left[3] ]
+ l1 = vext_u16(l0, l3, 1);
+ // [ left[2], left[3], left[3], left[3] ]
+ l2 = vext_u16(l0, l3, 2);
+
+ c0 = vrhadd_u16(l0, l1);
+ c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
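+  // c0 is the rounded 2-tap average (l0 + l1 + 1) >> 1 and c1 the exact
+  // 3-tap average (l0 + 2 * l1 + l2 + 2) >> 2 (the same vhadd/vrhadd
+  // identity used by the diagonal predictors above); interleaving them
+  // yields the D207 diagonal.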
+
+ c01_lo = vzip_u16(c0, c1).val[0];
+ c01_hi = vzip_u16(c0, c1).val[1];
+
+ // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
+ // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
+ // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
+ // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+ vst1_u16(dst + 0 * stride, c01_lo);
+ vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
+ vst1_u16(dst + 2 * stride, c01_hi);
+ vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
+}
+
+void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l7 = vld1q_dup_u16(left + 7);
+
+ // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+ l1 = vextq_u16(l0, l7, 1);
+ // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+ l2 = vextq_u16(l0, l7, 2);
+
+ c0 = vrhaddq_u16(l0, l1);
+ c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+
+ c01_lo = vzipq_u16(c0, c1).val[0];
+ c01_hi = vzipq_u16(c0, c1).val[1];
+
+ vst1q_u16(dst + 0 * stride, c01_lo);
+ vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
+ vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
+ vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
+ vst1q_u16(dst + 4 * stride, c01_hi);
+ vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
+ vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
+ vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
+}
+
+void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l2 = vld1q_u16(left + 2);
+ l8 = vld1q_u16(left + 8);
+ l15 = vld1q_dup_u16(left + 15);
+
+ l9 = vextq_u16(l8, l15, 1);
+ l10 = vextq_u16(l8, l15, 2);
+
+ c0[0] = vrhaddq_u16(l0, l1);
+ c0[1] = vrhaddq_u16(l8, l9);
+ c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+ c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+
+ c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+ c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+ c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+ c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+
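+  // c01[0..3] now hold the interleaved (avg2, avg3) pairs. Below, output
+  // row r is the 16 samples starting 2 * r entries into this chain, padded
+  // with the replicated last left pixel (l15) once the chain runs out.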
+ vst1q_u16(dst + 0 * stride + 0, c01[0]);
+ vst1q_u16(dst + 0 * stride + 8, c01[1]);
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+
+ vst1q_u16(dst + 4 * stride + 0, c01[1]);
+ vst1q_u16(dst + 4 * stride + 8, c01[2]);
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+
+ vst1q_u16(dst + 8 * stride + 0, c01[2]);
+ vst1q_u16(dst + 8 * stride + 8, c01[3]);
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
+
+ vst1q_u16(dst + 12 * stride + 0, c01[3]);
+ vst1q_u16(dst + 12 * stride + 8, l15);
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
+ vst1q_u16(dst + 13 * stride + 8, l15);
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
+ vst1q_u16(dst + 14 * stride + 8, l15);
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
+ vst1q_u16(dst + 15 * stride + 8, l15);
+}
+
+void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
+ c1[4], c01[8];
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l2 = vld1q_u16(left + 2);
+ l8 = vld1q_u16(left + 8);
+ l9 = vld1q_u16(left + 9);
+ l10 = vld1q_u16(left + 10);
+ l16 = vld1q_u16(left + 16);
+ l17 = vld1q_u16(left + 17);
+ l18 = vld1q_u16(left + 18);
+ l24 = vld1q_u16(left + 24);
+ l31 = vld1q_dup_u16(left + 31);
+
+ l25 = vextq_u16(l24, l31, 1);
+ l26 = vextq_u16(l24, l31, 2);
+
+ c0[0] = vrhaddq_u16(l0, l1);
+ c0[1] = vrhaddq_u16(l8, l9);
+ c0[2] = vrhaddq_u16(l16, l17);
+ c0[3] = vrhaddq_u16(l24, l25);
+ c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+ c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+ c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
+ c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
+
+ c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+ c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+ c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+ c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+ c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
+ c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
+ c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
+ c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
+
+ vst1q_u16(dst + 0 * stride + 0, c01[0]);
+ vst1q_u16(dst + 0 * stride + 8, c01[1]);
+ vst1q_u16(dst + 0 * stride + 16, c01[2]);
+ vst1q_u16(dst + 0 * stride + 24, c01[3]);
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
+
+ vst1q_u16(dst + 4 * stride + 0, c01[1]);
+ vst1q_u16(dst + 4 * stride + 8, c01[2]);
+ vst1q_u16(dst + 4 * stride + 16, c01[3]);
+ vst1q_u16(dst + 4 * stride + 24, c01[4]);
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
+
+ vst1q_u16(dst + 8 * stride + 0, c01[2]);
+ vst1q_u16(dst + 8 * stride + 8, c01[3]);
+ vst1q_u16(dst + 8 * stride + 16, c01[4]);
+ vst1q_u16(dst + 8 * stride + 24, c01[5]);
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
+
+ vst1q_u16(dst + 12 * stride + 0, c01[3]);
+ vst1q_u16(dst + 12 * stride + 8, c01[4]);
+ vst1q_u16(dst + 12 * stride + 16, c01[5]);
+ vst1q_u16(dst + 12 * stride + 24, c01[6]);
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
+
+ vst1q_u16(dst + 16 * stride + 0, c01[4]);
+ vst1q_u16(dst + 16 * stride + 8, c01[5]);
+ vst1q_u16(dst + 16 * stride + 16, c01[6]);
+ vst1q_u16(dst + 16 * stride + 24, c01[7]);
+ vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
+
+ vst1q_u16(dst + 20 * stride + 0, c01[5]);
+ vst1q_u16(dst + 20 * stride + 8, c01[6]);
+ vst1q_u16(dst + 20 * stride + 16, c01[7]);
+ vst1q_u16(dst + 20 * stride + 24, l31);
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
+
+ vst1q_u16(dst + 24 * stride + 0, c01[6]);
+ vst1q_u16(dst + 24 * stride + 8, c01[7]);
+ vst1q_u16(dst + 24 * stride + 16, l31);
+ vst1q_u16(dst + 24 * stride + 24, l31);
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
+
+ vst1q_u16(dst + 28 * stride + 0, c01[7]);
+ vst1q_u16(dst + 28 * stride + 8, l31);
+ vst1q_u16(dst + 28 * stride + 16, l31);
+ vst1q_u16(dst + 28 * stride + 24, l31);
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
+}
+
+//------------------------------------------------------------------------------
+
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t row = vld1_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, dst += stride) {
+ vst1_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t row = vld1q_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1q_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t row0 = vld1q_u16(above + 0);
+ const uint16x8_t row1 = vld1q_u16(above + 8);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 16; i++) {
+ vst1q_u16(dst + 0, row0);
+ vst1q_u16(dst + 8, row1);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t row0 = vld1q_u16(above + 0);
+ const uint16x8_t row1 = vld1q_u16(above + 8);
+ const uint16x8_t row2 = vld1q_u16(above + 16);
+ const uint16x8_t row3 = vld1q_u16(above + 24);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 32; i++) {
+ vst1q_u16(dst + 0, row0);
+ vst1q_u16(dst + 8, row1);
+ vst1q_u16(dst + 16, row2);
+ vst1q_u16(dst + 24, row3);
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t left_u16 = vld1_u16(left);
+ uint16x4_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdup_lane_u16(left_u16, 0);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 1);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 2);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 3);
+ vst1_u16(dst, row);
+}
+
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16);
+ const uint16x4_t left_high = vget_high_u16(left_u16);
+ uint16x8_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdupq_lane_u16(left_low, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 3);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 3);
+ vst1q_u16(dst, row);
+}
+
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_16(&dst, stride, row);
+ }
+}
+
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_32(&dst, stride, row);
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+ const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
+ const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x8_t sum;
+ uint16x8_t row;
+
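+  // TM ("true motion") prediction: each output pixel is
+  // clip(left[r] + above[c] - top_left, 0, (1 << bd) - 1). sub holds
+  // above[c] - top_left once; each pair of rows then adds the duplicated
+  // left pixels, clamps to max with vminq_s16(), and clamps negative values
+  // to zero via the saturating vqshluq_n_s16(sum, 0) conversion to unsigned.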
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+}
+
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub,
+ const int16x8_t max) {
+ uint16x8_t row;
+ int16x8_t sum = vaddq_s16(left_dup, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1q_u16(*dst, row);
+ *dst += stride;
+}
+
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+ const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x4_t left_s16d;
+ int16x8_t left_dup;
+ int i;
+
+ left_s16d = vget_low_s16(left_s16);
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+ }
+}
+
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t max) {
+ uint16x8_t row0, row1;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (j = 0; j < 2; j++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+ }
+ }
+}
+
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3, const int16x8_t max) {
+ uint16x8_t row0, row1, row2, row3;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ sum2 = vminq_s16(sum2, max);
+ sum3 = vminq_s16(sum3, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ row2 = vqshluq_n_s16(sum2, 0);
+ row3 = vqshluq_n_s16(sum3, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += 8;
+ vst1q_u16(*dst, row2);
+ *dst += 8;
+ vst1q_u16(*dst, row3);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+ const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ const int16x8_t sub2 = vsubq_s16(above2, top_left);
+ const int16x8_t sub3 = vsubq_s16(above3, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..8d6e8acc4c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, uint16x8_t *blimit_vec,
+ uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
+ const int bd) {
+ const int16x8_t shift = vdupq_n_s16(bd - 8);
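+  // The 8-bit thresholds are widened and scaled into the bd-bit pixel range;
+  // e.g. for bd == 12 the shift is 4, so an 8-bit blimit of 16 becomes 256.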
+ *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
+ *limit_vec = vmovl_u8(vld1_dup_u8(limit));
+ *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
+ *blimit_vec = vshlq_u16(*blimit_vec, shift);
+ *limit_vec = vshlq_u16(*limit_vec, shift);
+ *thresh_vec = vshlq_u16(*thresh_vec, shift);
+}
+
+// Here flat is a 128-bit vector in which each 16-bit lane is the mask of one
+// pixel. When it is used to control filter branches, we only need to detect
+// whether it is all 0s or all 1s. The additions below reduce flat to a single
+// 32-bit number, flat_status:
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -4. (This holds
+// because each mask occupies more than 1 bit.)
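+// Two easy cases to check: if every lane of flat is 0 the two 64-bit halves
+// are 0 and flat_status is 0; if only lane 0 is set (0xffff) the reduction
+// returns 0xffff, which is neither 0 nor -4, so callers fall through to the
+// per-lane blend path.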
+static INLINE uint32_t calc_flat_status(const uint16x8_t flat) {
+ const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)),
+ vreinterpret_u64_u16(vget_high_u16(flat)));
+ const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0));
+ return vget_lane_u32(vreinterpret_u32_u64(t1), 0);
+}
+
+static INLINE uint16x8_t
+filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
+ const uint16x8_t thresh, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
+ uint16x8_t max, t0, t1;
+
+ max = vabdq_u16(p1, p0);
+ max = vmaxq_u16(max, vabdq_u16(q1, q0));
+ *hev = vcgtq_u16(max, thresh);
+ *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
+ *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
+ *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
+ *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
+ t0 = vabdq_u16(p0, q0);
+ t1 = vabdq_u16(p1, q1);
+ t0 = vaddq_u16(t0, t0);
+ t1 = vshrq_n_u16(t1, 1);
+ t0 = vaddq_u16(t0, t1);
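+  // t0 now holds abs(p0 - q0) * 2 + abs(p1 - q1) / 2, the edge-variance
+  // measure that the scalar reference compares against blimit.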
+ *mask = vcleq_u16(*mask, limit);
+ t0 = vcleq_u16(t0, blimit);
+ *mask = vandq_u16(*mask, t0);
+
+ return max;
+}
+
+static INLINE uint16x8_t filter_flat_hev_mask(
+ const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh,
+ const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat,
+ uint32_t *flat_status, uint16x8_t *hev, const int bd) {
+ uint16x8_t mask;
+ const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0,
+ q0, q1, q2, q3, hev, &mask);
+ *flat = vmaxq_u16(max, vabdq_u16(p2, p0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0));
+ *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */
+ *flat = vandq_u16(*flat, mask);
+ *flat_status = calc_flat_status(*flat);
+
+ return mask;
+}
+
+static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, const uint16x8_t q4,
+ const uint16x8_t flat,
+ uint32_t *flat2_status, const int bd) {
+ uint16x8_t flat2 = vabdq_u16(p4, p0);
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0));
+ flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8)));
+ flat2 = vandq_u16(flat2, flat);
+ *flat2_status = calc_flat_status(flat2);
+
+ return flat2;
+}
+
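+// Map unsigned bd-bit pixels onto a symmetric signed range by subtracting the
+// mid-point 0x80 << (bd - 8); e.g. for bd == 10 the offset is 0x200, so
+// [0, 1023] maps to [-512, 511]. flip_sign_back() undoes this after
+// filtering.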
+static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
+ const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
+ return vreinterpretq_s16_u16(vsubq_u16(v, offset));
+}
+
+static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
+ const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
+ return vreinterpretq_u16_s16(vaddq_s16(v, offset));
+}
+
+static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1,
+ const uint16x8_t add0, const uint16x8_t add1,
+ uint16x8_t *sum) {
+ *sum = vsubq_u16(*sum, sub0);
+ *sum = vsubq_u16(*sum, sub1);
+ *sum = vaddq_u16(*sum, add0);
+ *sum = vaddq_u16(*sum, add1);
+}
+
+static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0,
+ const uint16x8_t sub1,
+ const uint16x8_t add0,
+ const uint16x8_t add1,
+ uint16x8_t *sum) {
+ filter_update(sub0, sub1, add0, add1, sum);
+ return vrshrq_n_u16(*sum, 3);
+}
+
+static INLINE uint16x8_t apply_15_tap_filter_kernel(
+ const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1,
+ const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in,
+ uint16x8_t *sum) {
+ filter_update(sub0, sub1, add0, add1, sum);
+ return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in);
+}
+
+// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, uint16x8_t *oq2) {
+ uint16x8_t sum;
+ sum = vaddq_u16(p3, p3); // 2*p3
+ sum = vaddq_u16(sum, p3); // 3*p3
+ sum = vaddq_u16(sum, p2); // 3*p3+p2
+ sum = vaddq_u16(sum, p2); // 3*p3+2*p2
+ sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1
+ sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vrshrq_n_u16(sum, 3);
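+  // Each kernel call slides the 7-tap window along by one pixel; e.g. for
+  // *op1 the running sum becomes
+  // 3*p3+2*p2+p1+p0+q0 - p3 - p2 + p1 + q1 = 2*p3+p2+2*p1+p0+q0+q1.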
+ *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum);
+ *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum);
+ *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum);
+ *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum);
+ *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum);
+}
+
+static INLINE void apply_7_tap_filter(const uint16x8_t flat,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, uint16x8_t *oq2) {
+ uint16x8_t tp1, tp0, tq0, tq1;
+ calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1,
+ oq2);
+ *op2 = vbslq_u16(flat, *op2, p2);
+ *op1 = vbslq_u16(flat, tp1, *op1);
+ *op0 = vbslq_u16(flat, tp0, *op0);
+ *oq0 = vbslq_u16(flat, tq0, *oq0);
+ *oq1 = vbslq_u16(flat, tq1, *oq1);
+ *oq2 = vbslq_u16(flat, *oq2, q2);
+}
+
+// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+static INLINE void apply_15_tap_filter(
+ const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5,
+ uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+ uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) {
+ uint16x8_t sum;
+ sum = vshlq_n_u16(p7, 3); // 8*p7
+ sum = vsubq_u16(sum, p7); // 7*p7
+ sum = vaddq_u16(sum, p6); // 7*p7+p6
+ sum = vaddq_u16(sum, p6); // 7*p7+2*p6
+ sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5
+ sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4
+ sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3
+ sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
+ sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6);
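+  // As in calc_7_tap_filter(), each kernel call slides the 15-tap window by
+  // one pixel; e.g. for *op5 the sum becomes
+  // 6*p7 + p6 + 2*p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 before rounding.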
+ *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum);
+ *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum);
+ *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum);
+ *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
+ *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
+ *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
+ *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
+ *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
+ *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
+ *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum);
+ *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum);
+ *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum);
+ *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum);
+}
+
+static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, const int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
+ const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
+ int16x8_t filter, filter1, filter2, t;
+ int16x8_t ps1 = flip_sign(p1, bd);
+ int16x8_t ps0 = flip_sign(p0, bd);
+ int16x8_t qs0 = flip_sign(q0, bd);
+ int16x8_t qs1 = flip_sign(q1, bd);
+
+ /* add outer taps if we have high edge variance */
+ filter = vsubq_s16(ps1, qs1);
+ filter = vmaxq_s16(filter, min);
+ filter = vminq_s16(filter, max);
+ filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
+ t = vsubq_s16(qs0, ps0);
+
+ /* inner taps */
+ filter = vaddq_s16(filter, t);
+ filter = vaddq_s16(filter, t);
+ filter = vaddq_s16(filter, t);
+ filter = vmaxq_s16(filter, min);
+ filter = vminq_s16(filter, max);
+ filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));
+
+  /* Save the bottom 3 bits so that we round one side +4 and the other +3. */
+  /* If the result equals 4 we will adjust by -1 to account for the fact */
+  /* that we would have rounded it by 3 the other way. */
+ t = vaddq_s16(filter, vdupq_n_s16(4));
+ t = vminq_s16(t, max);
+ filter1 = vshrq_n_s16(t, 3);
+ t = vaddq_s16(filter, vdupq_n_s16(3));
+ t = vminq_s16(t, max);
+ filter2 = vshrq_n_s16(t, 3);
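+  // Worked example: filter == 4 gives filter1 == (4 + 4) >> 3 == 1 but
+  // filter2 == (4 + 3) >> 3 == 0, so q0 moves by one more than p0 at the
+  // rounding boundary, which is exactly the asymmetry described above.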
+
+ qs0 = vsubq_s16(qs0, filter1);
+ qs0 = vmaxq_s16(qs0, min);
+ qs0 = vminq_s16(qs0, max);
+ ps0 = vaddq_s16(ps0, filter2);
+ ps0 = vmaxq_s16(ps0, min);
+ ps0 = vminq_s16(ps0, max);
+ *oq0 = flip_sign_back(qs0, bd);
+ *op0 = flip_sign_back(ps0, bd);
+
+ /* outer tap adjustments */
+ filter = vrshrq_n_s16(filter1, 1);
+ filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));
+
+ qs1 = vsubq_s16(qs1, filter);
+ qs1 = vmaxq_s16(qs1, min);
+ qs1 = vminq_s16(qs1, max);
+ ps1 = vaddq_s16(ps1, filter);
+ ps1 = vmaxq_s16(ps1, min);
+ ps1 = vminq_s16(ps1, max);
+ *oq1 = flip_sign_back(qs1, bd);
+ *op1 = flip_sign_back(ps1, bd);
+}
+
+static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat,
+ const uint32_t flat_status, const uint16x8_t hev,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+ uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+ const int bd) {
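+  // flat_status == -4 signals that every lane took the flat path (see
+  // calc_flat_status()), so filter4()'s output would be overwritten entirely
+  // and the 7-tap result can be stored without blending. flat_status == 0
+  // means no lane is flat and the 7-tap filter is skipped.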
+ if (flat_status != (uint32_t)-4) {
+ filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+ *op2 = p2;
+ *oq2 = q2;
+ if (flat_status) {
+ apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+ oq0, oq1, oq2);
+ }
+ } else {
+ calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1,
+ oq2);
+ }
+}
+
+static INLINE void filter16(
+ const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status,
+ const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev,
+ const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5,
+ const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3,
+ const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6,
+ const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4,
+ uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+ uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3,
+ uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) {
+ if (flat_status != (uint32_t)-4) {
+ filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+ }
+
+ if (flat_status) {
+ *op2 = p2;
+ *oq2 = q2;
+ if (flat2_status != (uint32_t)-4) {
+ apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+ oq0, oq1, oq2);
+ }
+ if (flat2_status) {
+ apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
+ q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
+ oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+ }
+ }
+}
+
+static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
+ uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
+ uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
+ uint16x8_t *q3) {
+ *p3 = vld1q_u16(s);
+ s += p;
+ *p2 = vld1q_u16(s);
+ s += p;
+ *p1 = vld1q_u16(s);
+ s += p;
+ *p0 = vld1q_u16(s);
+ s += p;
+ *q0 = vld1q_u16(s);
+ s += p;
+ *q1 = vld1q_u16(s);
+ s += p;
+ *q2 = vld1q_u16(s);
+ s += p;
+ *q3 = vld1q_u16(s);
+}
+
+static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0,
+ uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3,
+ uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6,
+ uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9,
+ uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12,
+ uint16x8_t *s13, uint16x8_t *s14,
+ uint16x8_t *s15) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+ s += p;
+ *s8 = vld1q_u16(s);
+ s += p;
+ *s9 = vld1q_u16(s);
+ s += p;
+ *s10 = vld1q_u16(s);
+ s += p;
+ *s11 = vld1q_u16(s);
+ s += p;
+ *s12 = vld1q_u16(s);
+ s += p;
+ *s13 = vld1q_u16(s);
+ s += p;
+ *s14 = vld1q_u16(s);
+ s += p;
+ *s15 = vld1q_u16(s);
+}
+
+static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+ s += p;
+ vst1q_u16(s, s4);
+ s += p;
+ vst1q_u16(s, s5);
+}
+
+static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1) {
+ uint16x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
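+  // Each vst4q_lane_u16() below writes lane i of all four vectors, i.e. the
+  // four pixels of one output row, folding the 8x4 transpose into the stores.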
+ vst4q_lane_u16(s, o, 0);
+ s += p;
+ vst4q_lane_u16(s, o, 1);
+ s += p;
+ vst4q_lane_u16(s, o, 2);
+ s += p;
+ vst4q_lane_u16(s, o, 3);
+ s += p;
+ vst4q_lane_u16(s, o, 4);
+ s += p;
+ vst4q_lane_u16(s, o, 5);
+ s += p;
+ vst4q_lane_u16(s, o, 6);
+ s += p;
+ vst4q_lane_u16(s, o, 7);
+}
+
+static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ uint16x8x3_t o0, o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o1.val[0] = s3;
+ o1.val[1] = s4;
+ o1.val[2] = s5;
+ vst3q_lane_u16(s - 3, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
+static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5, const uint16x8_t s6) {
+ uint16x8x4_t o0;
+ uint16x8x3_t o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o0.val[3] = s3;
+ o1.val[0] = s4;
+ o1.val[1] = s5;
+ o1.val[2] = s6;
+ vst4q_lane_u16(s - 4, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
+static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint32_t flat_status,
+ const uint32_t flat2_status) {
+ if (flat_status) {
+ if (flat2_status) {
+ vst1q_u16(s - 7 * p, p6);
+ vst1q_u16(s - 6 * p, p5);
+ vst1q_u16(s - 5 * p, p4);
+ vst1q_u16(s - 4 * p, p3);
+ vst1q_u16(s + 3 * p, q3);
+ vst1q_u16(s + 4 * p, q4);
+ vst1q_u16(s + 5 * p, q5);
+ vst1q_u16(s + 6 * p, q6);
+ }
+ vst1q_u16(s - 3 * p, p2);
+ vst1q_u16(s + 2 * p, q2);
+ }
+ vst1q_u16(s - 2 * p, p1);
+ vst1q_u16(s - 1 * p, p0);
+ vst1q_u16(s + 0 * p, q0);
+ vst1q_u16(s + 1 * p, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_8x4(s - 2 * p, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
+// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col
+// 67| warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]' for the outputs oq1 through op1. Without reworking
+// the code or adding an additional branch, the warning cannot be silenced.
+// The loopfilter is only called when needed for a block, so these output
+// pixels will always be set before use.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+static void lpf_horizontal_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec,
+ const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
+ &q3, &q4, &q5, &q6, &q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+ oq5, oq6, flat_status, flat2_status);
+}
+
+static void lpf_vertical_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec, const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+ transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5,
+ (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2,
+ (int16x8_t *)&p1, (int16x8_t *)&p0);
+ load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2,
+ (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5,
+ (int16x8_t *)&q6, (int16x8_t *)&q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ if (flat_status) {
+ if (flat2_status) {
+ store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0);
+ store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+ } else {
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+ }
+ } else {
+ store_4x8(s - 2, p, op1, op0, oq0, oq1);
+ }
+}
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..d2a7add60d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+ const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+ tran_low_t *dqcoeff_ptr) {
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+ const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+ int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // Take the sign and absolute value of each coefficient (2 vectors of
+  // 4 x 32-bit ints each)
+ const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+ const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+ const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+ const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks selecting the elements outside the zbin dead zone
+  // (i.e. abs value >= zbin)
+ const int32x4_t zbin_mask_0 =
+ vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+ const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+ vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+ // Get the rounded values
+ const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+ const int32x4_t rounded_1 =
+ vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // vqdmulhq_s32 returns the high half of the doubled product, so with quant
+  // pre-scaled: (rounded * (quant << 15) * 2) >> 32 == (rounded * quant) >> 16
+ int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+ int32x4_t qcoeff_tmp_1 =
+ vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+ // Add rounded values
+ qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+ qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // Second stage, same trick: (tmp * (quant_shift << 15) * 2) >> 32 ==
+  // (tmp * quant_shift) >> 16 (the 32x32 caller pre-scales by 2^16 instead,
+  // giving >> 15)
+ qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+ qcoeff_tmp_1 =
+ vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+ // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+ qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+ // Only keep the relevant coeffs
+ *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+ *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
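+
+// A scalar model of the sequence above (a sketch for clarity; quant and
+// quant_shift here are the raw 16-bit values, before the callers'
+// pre-scaling, and saturation is omitted):
+//
+//   int32_t quantize_model(int32_t coeff, int32_t zbin, int32_t round,
+//                          int16_t quant, int16_t quant_shift) {
+//     const int32_t abs_coeff = abs(coeff);
+//     int64_t tmp = abs_coeff + round;
+//     tmp += (tmp * quant) >> 16;                        // first vqdmulhq_s32
+//     int32_t q = (int32_t)((tmp * quant_shift) >> 16);  // second vqdmulhq_s32
+//     if (coeff < 0) q = -q;                             // restore the sign
+//     return abs_coeff >= zbin ? q : 0;                  // zbin dead-zone mask
+//   }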
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant,
+ const int32x4_t quant_shift, const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+  // Only the first element of each vector is the DC value; the remaining
+  // elements all hold the same AC value, so we only need to pass 4 x 32-bit
+  // vectors and can reconstruct the high half by duplicating the 2nd element
+  // (lane 1).
+ int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+ int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+  // Widen the quant and quant_shift vectors to 32-bit elements and pre-scale
+  // them by 2^15 so that vqdmulhq_s32 gives the desired >> 16.
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
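+    // From this point on every coefficient is AC, so flatten each parameter
+    // vector to its AC value (lane 1) before entering the loop.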
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+  // These casts silence unused-parameter warnings. They sit at the end of
+  // the function because statements may not precede declarations in C90.
+  (void)n_coeffs;
+  (void)scan;
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+ int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+  // Add 1 to negative values so the arithmetic right shift rounds towards
+  // zero, matching the C reference's division by 2.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
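+
+// Scalar model of the rounding above (a sketch for clarity): the C reference
+// computes qcoeff * dequant / 2, and C division truncates towards zero, so
+// negative products need +1 before the arithmetic right shift:
+//
+//   int32_t dq = qcoeff * dequant;
+//   dq = (dq + (dq < 0)) >> 1;  // == (qcoeff * dequant) / 2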
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+ const int32x4_t quant, const int32x4_t quant_shift,
+ const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+ const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is the DC value; the remaining
+  // elements all hold the same AC value, so we only need to pass 4 x 32-bit
+  // vectors and can reconstruct the high half by duplicating the 2nd element
+  // (lane 1).
+ int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1);
+ int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1);
+  // Widen the quant and quant_shift vectors to 32-bit elements and pre-scale
+  // them so we can use vqdmulhq_s32.
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+ int32x4_t quant_shift =
+ vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16);
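+  // Note the shift by 16 here versus 15 in vpx_highbd_quantize_b_neon above:
+  // the 32x32 C reference applies a final >> 15 rather than >> 16, and since
+  // vqdmulhq_s32 yields the doubled product >> 32, pre-scaling quant_shift by
+  // 2^16 produces (tmp * quant_shift) >> 15.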
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
new file mode 100644
index 0000000000..a6684b0534
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+ uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+ sum[3] = vabal_u16(sum[3], s, r3);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
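+// The 16-bit accumulators below are safe: this path handles blocks of at most
+// 8x16, so each lane sums at most 16 absolute differences of 12-bit values
+// (16 * 4095 = 65520), which just fits in a uint16_t.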
+static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+ sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+ } while (++i < h);
+
+ sum_u32[0] = vpaddlq_u16(sum[0]);
+ sum_u32[1] = vpaddlq_u16(sum[1]);
+ sum_u32[2] = vpaddlq_u16(sum[2]);
+ sum_u32[3] = vpaddlq_u16(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+ uint32x4_t *const sad_sum) {
+ uint16x8_t abs_diff = vabdq_u16(src, ref);
+ *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s0, s1;
+
+ s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3;
+
+ s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+ s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+ s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+ &sum_lo[3]);
+
+ s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+ &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
+}
+
+static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
+}
+
+#define HBD_SAD_WXH_4D_NEON(w, h) \
+ void vpx_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_4D_NEON(4, 4)
+HBD_SAD_WXH_4D_NEON(4, 8)
+
+HBD_SAD_WXH_4D_NEON(8, 4)
+HBD_SAD_WXH_4D_NEON(8, 8)
+HBD_SAD_WXH_4D_NEON(8, 16)
+
+HBD_SAD_WXH_4D_NEON(16, 8)
+HBD_SAD_WXH_4D_NEON(16, 16)
+HBD_SAD_WXH_4D_NEON(16, 32)
+
+HBD_SAD_WXH_4D_NEON(32, 16)
+HBD_SAD_WXH_4D_NEON(32, 32)
+HBD_SAD_WXH_4D_NEON(32, 64)
+
+HBD_SAD_WXH_4D_NEON(64, 32)
+HBD_SAD_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_WXH_4D_NEON
+
+#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \
+ void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
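+
+// The skip variants sample every other row (both strides doubled and the
+// height halved), then double the result to approximate the full-height SAD.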
+
+HBD_SAD_SKIP_WXH_4D_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..b99bac66cd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ sum = vabal_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ sum = vabaq_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1;
+ uint16x8_t diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint16x8_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ diff2 = vabdq_u16(s2, r2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ diff3 = vabdq_u16(s3, r3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+#define HBD_SAD_WXH_NEON(w, h) \
+ unsigned int vpx_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+HBD_SAD_WXH_NEON(4, 4)
+HBD_SAD_WXH_NEON(4, 8)
+
+HBD_SAD_WXH_NEON(8, 4)
+HBD_SAD_WXH_NEON(8, 8)
+HBD_SAD_WXH_NEON(8, 16)
+
+HBD_SAD_WXH_NEON(16, 8)
+HBD_SAD_WXH_NEON(16, 16)
+HBD_SAD_WXH_NEON(16, 32)
+
+HBD_SAD_WXH_NEON(32, 16)
+HBD_SAD_WXH_NEON(32, 32)
+HBD_SAD_WXH_NEON(32, 64)
+
+HBD_SAD_WXH_NEON(64, 32)
+HBD_SAD_WXH_NEON(64, 64)
+
+#undef HBD_SAD_WXH_NEON
+
+#define HBD_SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+HBD_SAD_SKIP_WXH_NEON(4, 4)
+HBD_SAD_SKIP_WXH_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_NEON(8, 4)
+HBD_SAD_SKIP_WXH_NEON(8, 8)
+HBD_SAD_SKIP_WXH_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_NEON(16, 8)
+HBD_SAD_SKIP_WXH_NEON(16, 16)
+HBD_SAD_SKIP_WXH_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_NEON(32, 16)
+HBD_SAD_SKIP_WXH_NEON(32, 32)
+HBD_SAD_SKIP_WXH_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_NEON(64, 32)
+HBD_SAD_SKIP_WXH_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_NEON
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..683df5797a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can divide both weights by their highest common factor (16), so that
+// the sum of the weights becomes 8 instead of 128. The benefits of this are
+// two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
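+//
+// A scalar model of one filtered output pixel (a sketch for clarity;
+// bilinear_tap is not part of the library):
+//
+//   uint16_t bilinear_tap(uint16_t a, uint16_t b, int filter_offset) {
+//     const unsigned f0 = 8 - filter_offset;         // first-pixel weight
+//     const unsigned f1 = filter_offset;             // second-pixel weight
+//     return (uint16_t)((a * f0 + b * f1 + 4) >> 3); // round to nearest
+//   }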
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block whose width is a multiple of 8, with any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
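+// When filter_offset == 4 the two bilinear taps are equal (4/8 each), so the
+// blend reduces to a rounding average of the two inputs, which is exactly
+// what vrhaddq_u16 computes.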
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
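+
+// The two filter calls above implement separable bilinear interpolation: the
+// first pass filters horizontally (pixel_step = 1) into an intermediate block
+// one row taller than the output (h + 1 rows), and the second pass filters
+// that block vertically (pixel_step = w).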
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
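+
+// Case summary for the specialized macro above: an offset of 0 needs no
+// filtering in that direction, an offset of 4 means equal taps (a rounding
+// average), and any other offset requires the full bilinear filter. The
+// horizontal (xoffset) and vertical (yoffset) cases combine independently,
+// giving the nine branches above.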
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+  // This helper is only used for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ }
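+
+// Fusing vpx_highbd_comp_avg_pred into the second (vertical) filter pass
+// avoids writing the intermediate block to memory and re-reading it for a
+// separate averaging step.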
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..75fde676a0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ int i = h;
+ do {
+ const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = horizontal_add_int32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements: even the conservative
+// per-lane bound for a 64x64 block, 1023*1023*64*16 = 1071645696, fits in an
+// int32.
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr + j);
+ const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_long_add_uint32x4(vaddq_u32(
+ vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+// For 12-bit data we can only accumulate up to 128 squared differences per
+// 32-bit lane (4095 * 4095 * 128 = 2146435200, just under 2^31). With two
+// int32x4 accumulators this means at most 32 rows of 32 elements
+// (32 * 32 / 8 = 128 per lane) or 16 rows of 64 elements before we have to
+// widen the partial sums into 64-bit elements. Blocks of size 32x64, 64x32 and
+// 64x64 are therefore processed in a different helper function.
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+ const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+ int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
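+  // For example, with w == 64 and h_limit == 16, each pass adds at most
+  // 16 * 64 / 8 = 128 squares per 32-bit lane before widening, matching the
+  // 12-bit bound described above.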
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint64_t)horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_32xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+ sum);
+}
+
+#define HBD_VARIANCE_WXH_8_NEON(w, h) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HBD_VARIANCE_WXH_10_NEON(w, h) \
+ uint32_t vpx_highbd_10_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_NEON(w, h) \
+ uint32_t vpx_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \
+ uint32_t vpx_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
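+// Note: each wrapper computes variance = sse - sum^2 / (w * h). For 10- and
+// 12-bit input, sse and sum are first scaled back to the 8-bit range (sse by
+// 2 * (bd - 8) bits, sum by (bd - 8) bits) so the returned values stay within
+// the ranges the 8-bit API expects.
+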
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ *sum = (int)sum_long; \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ }
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
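+// The MSE wrappers at the end of this file only instantiate blocks up to
+// 16x16, so each uint32 lane below accumulates at most 16 * 16 / 8 = 32
+// squares of at most 4095 * 4095 = 16769025, comfortably within uint32 range.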
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse_u32[0] =
+ vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+ sse_u32[1] =
+ vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
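+// These mse8 helpers are only reached from vpx_highbd_8_mse* (see the macro at
+// the end of this file), i.e. for 8-bit content held in 16-bit buffers, so
+// vmovn_u16 narrows losslessly and the differences can be squared and
+// accumulated with a single vdotq_u32 per 16 pixels.
+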
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h / 2;
+ do {
+ uint16x8_t s0, s1, r0, r1;
+ uint8x16_t s, r, diff;
+
+ s0 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+
+ s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1;
+ uint8x16_t s, r, diff;
+
+ s0 = vld1q_u16(src_ptr);
+ s1 = vld1q_u16(src_ptr + 8);
+ r0 = vld1q_u16(ref_ptr);
+ r1 = vld1q_u16(ref_ptr + 8);
+
+ s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(sse_u32);
+ return *sse;
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h,
+ sse);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h,
+ sse);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+#define HIGHBD_MSE_WXH_NEON(w, h) \
+ uint32_t vpx_highbd_8_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
new file mode 100644
index 0000000000..47684473ca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -0,0 +1,931 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+}
+
+static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
+static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3,
+ const uint16x8_t s4, const uint16x8_t s5,
+ const uint16x8_t s6, const uint16x8_t s7) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+ s += p;
+ vst1q_u16(s, s4);
+ s += p;
+ vst1q_u16(s, s5);
+ s += p;
+ vst1q_u16(s, s6);
+ s += p;
+ vst1q_u16(s, s7);
+}
+
+static INLINE int32x4_t highbd_convolve8_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int32x4_t sum;
+
+ sum = vmull_lane_s16(s0, filters_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
+ return sum;
+}
+
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filters, const uint16x8_t max) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int32x4_t sum0, sum1;
+ uint16x8_t d;
+
+ sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
+ sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
+ d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
+ d = vminq_u16(d, max);
+ return d;
+}
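+
+// The filter taps are in Q7 format (FILTER_BITS == 7), so the 32-bit sums from
+// these helpers are narrowed with a saturating rounding shift by 7 and, for
+// the 8-wide helper, clamped to the bit-depth maximum before use.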
+
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ uint16x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3;
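+    // The horizontal pass operates on transposed tiles: inputs are loaded and
+    // transposed so the 8-tap filter runs along vector lanes, and the filtered
+    // results are transposed back before being stored.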
+
+ if (h == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+ s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+ s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+ s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+ s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+ s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+ s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+ s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+ transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+ transpose_u16_4x4q(&d01, &d23);
+
+ vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+ vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+ vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+ int16x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3;
+
+ if (w == 4) {
+ do {
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+ &t4, &t5, &t6, &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ transpose_u16_8x4(&d0, &d1, &d2, &d3);
+ vst1_u16(dst, vget_low_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d3));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d3));
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ int width;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint16x8_t d4, d5, d6, d7;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+ max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+ max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+ max);
+ d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+ max);
+ d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+ max);
+ d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+ max);
+ d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+ max);
+ d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+ filters, max);
+
+ transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+ }
+}
+
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3;
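+    // This mirrors the non-averaging horizontal path above; the only
+    // difference is that each filtered result is combined with dst using a
+    // rounding average before the store.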
+
+ if (h == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t t0, t1, t2, t3;
+ uint16x8_t d01, d23, t01, t23;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+ s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+ s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+ s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+ s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+ s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+ s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+ s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+ transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+ transpose_u16_4x4q(&t01, &t23);
+
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 2 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+ vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+ vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+ int16x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ if (w == 4) {
+ do {
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+ &t4, &t5, &t6, &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+
+ d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 4 * dst_stride));
+ d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 5 * dst_stride));
+ d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 6 * dst_stride));
+ d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
+ vld1_u16(dst + 7 * dst_stride));
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1_u16(dst, vget_low_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d3));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d3));
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ int width;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint16x8_t d4, d5, d6, d7;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+ max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+ max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+ max);
+ d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+ max);
+ d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+ max);
+ d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+ max);
+ d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+ max);
+ d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+ filters, max);
+
+ transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+ d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
+ d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
+ d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
+ d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
+
+ store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+ }
+}
+
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3 * src_stride;
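+    // Back up 3 rows so the 8 taps cover rows y - 3 .. y + 4 around each
+    // output row y.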
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23;
+
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
+
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3 * src_stride;
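+    // As in the non-averaging vertical path above, but the filtered rows are
+    // averaged with dst before storing.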
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23, t01, t23;
+
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 1 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vld1q_u16(d + 0 * dst_stride);
+ d1 = vld1q_u16(d + 1 * dst_stride);
+ d2 = vld1q_u16(d + 2 * dst_stride);
+ d3 = vld1q_u16(d + 3 * dst_stride);
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..765a054f8d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
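+  // Every branch merges source and dst with vrhaddq_u16, the rounding halving
+  // add: d = (s + d + 1) >> 1.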
+ if (w < 8) { // avg4
+ uint16x4_t s0, s1, d0, d1;
+ uint16x8_t s01, d01;
+ do {
+ s0 = vld1_u16(src);
+ d0 = vld1_u16(dst);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ d1 = vld1_u16(dst + dst_stride);
+ src += src_stride;
+ s01 = vcombine_u16(s0, s1);
+ d01 = vcombine_u16(d0, d1);
+ d01 = vrhaddq_u16(s01, d01);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 8) { // avg8
+ uint16x8_t s0, s1, d0, d1;
+ do {
+ s0 = vld1q_u16(src);
+ d0 = vld1q_u16(dst);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ d1 = vld1q_u16(dst + dst_stride);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+
+ vst1q_u16(dst, d0);
+ dst += dst_stride;
+ vst1q_u16(dst, d1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w < 32) { // avg16
+ uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
+ do {
+ s0l = vld1q_u16(src);
+ s0h = vld1q_u16(src + 8);
+ d0l = vld1q_u16(dst);
+ d0h = vld1q_u16(dst + 8);
+ src += src_stride;
+ s1l = vld1q_u16(src);
+ s1h = vld1q_u16(src + 8);
+ d1l = vld1q_u16(dst + dst_stride);
+ d1h = vld1q_u16(dst + dst_stride + 8);
+ src += src_stride;
+
+ d0l = vrhaddq_u16(s0l, d0l);
+ d0h = vrhaddq_u16(s0h, d0h);
+ d1l = vrhaddq_u16(s1l, d1l);
+ d1h = vrhaddq_u16(s1h, d1h);
+
+ vst1q_u16(dst, d0l);
+ vst1q_u16(dst + 8, d0h);
+ dst += dst_stride;
+ vst1q_u16(dst, d1l);
+ vst1q_u16(dst + 8, d1h);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 32) { // avg32
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // avg64
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+
+ s0 = vld1q_u16(src + 32);
+ s1 = vld1q_u16(src + 40);
+ s2 = vld1q_u16(src + 48);
+ s3 = vld1q_u16(src + 56);
+ d0 = vld1q_u16(dst + 32);
+ d1 = vld1q_u16(dst + 40);
+ d2 = vld1q_u16(dst + 48);
+ d3 = vld1q_u16(dst + 56);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst + 32, d0);
+ vst1q_u16(dst + 40, d1);
+ vst1q_u16(dst + 48, d2);
+ vst1q_u16(dst + 56, d3);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..7751082083
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
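+  // A straight copy: the filter arguments are unused, and each branch moves w
+  // 16-bit pixels per row (two rows per iteration for widths up to 16).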
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ s4 = vld1q_u16(src + 32);
+ s5 = vld1q_u16(src + 40);
+ s6 = vld1q_u16(src + 48);
+ s7 = vld1q_u16(src + 56);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ vst1q_u16(dst + 32, s4);
+ vst1q_u16(dst + 40, s5);
+ vst1q_u16(dst + 48, s6);
+ vst1q_u16(dst + 56, s7);
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
new file mode 100644
index 0000000000..414ade3530
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+  // temp is sized for a 2x vertically scaled 64-tall block plus the 7 extra
+  // filter rows (2 * 64 + 7 = 135); + 1 to make it divisible by 4.
+ uint16_t temp[64 * 136];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
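+  // e.g. in the common unscaled case (h == 64, y_step_q4 == 16, y0_q4 < 16)
+  // this gives ((63 * 16 + y0_q4) >> 4) + 8 = 71 rows, well within the 136-row
+  // temp buffer.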
+
+  /* Filter starting 3 lines back. The NEON implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes into the
+   * temp buffer, which has plenty of extra room and is subsequently discarded,
+   * this is safe if somewhat less than ideal. */
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height, bd);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+  // temp is sized for a 2x vertically scaled 64-tall block plus the 7 extra
+  // filter rows (2 * 64 + 7 = 135); + 1 to make it divisible by 4.
+ uint16_t temp[64 * 136];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  /* This implementation has the same constraints as above. In addition, the
+   * result must only be averaged with dst after both passes, so the horizontal
+   * pass uses the plain (non-averaging) kernel and only the vertical pass
+   * averages.
+   */
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height, bd);
+ vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 0000000000..bf5192a683
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqaddq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
+static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqsubq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
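+  // Only the DC coefficient is nonzero, so the inverse transform is a constant
+  // residual: the two cospi_16_64 rounds above mirror the row and column
+  // passes, and |a1| is then added to or subtracted from every pixel with
+  // saturating 8-bit arithmetic.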
+
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ }
+}
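+
+/* A scalar model of the function above (an illustrative sketch, assuming
+ * clip_pixel() from vpx_dsp_common.h is in scope): with only the DC
+ * coefficient non-zero, both idct passes collapse to a multiply by
+ * cospi_16_64 (~0.7071 in Q14), so out1 ~= input[0] / 2 and a1 is that
+ * value rounded down by the final >> 6.
+ */
+static INLINE void idct16x16_1_add_model(const tran_low_t *input,
+                                         uint8_t *dest, int stride) {
+  const int16_t out0 =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  int r, c;
+  for (r = 0; r < 16; ++r, dest += stride) {
+    for (c = 0; c < 16; ++c) dest[c] = clip_pixel(dest[c] + a1);
+  }
+}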
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 0000000000..fc7f4a7747
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
+ int16x4_t *const d1) {
+ *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+}
+
+static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
+ const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int32x4_t *const t32) {
+ t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
+ t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
+ t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
+}
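+
+/* Worked form of the kernel above (illustrative; assumes the kCospi lane
+ * layout from idct_neon.h, where lane 1 holds cospi_8_64 and lane 3 holds
+ * cospi_24_64):
+ *   t32[0] = s0 * cospi_24_64 - s1 * cospi_8_64
+ *   t32[1] = s1 * cospi_24_64 + s0 * cospi_8_64
+ * wrap_low_4x2() then rounds both products down by DCT_CONST_BITS.
+ */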
+
+static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0, int16x4_t *const d1) {
+ int32x4_t t32[2];
+
+ idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+ wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0,
+ int16x4_t *const d1) {
+ int32x4_t t32[2];
+
+ idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+ t32[1] = vnegq_s32(t32[1]);
+ wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0,
+ int16x4_t *const d1) {
+ int32x4_t t32[3];
+
+ t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+ t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+ wrap_low_4x2(t32, d0, d1);
+}
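+
+/* The 16/16 butterfly above shares one product (illustrative):
+ *   d0 = round((s1 - s0) * cospi_16_64) >> DCT_CONST_BITS
+ *   d1 = round((s1 + s0) * cospi_16_64) >> DCT_CONST_BITS
+ * so t32[2] = s1 * cospi_16_64 is computed once and reused by the
+ * vmlsl/vmlal accumulations.
+ */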
+
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+ void *const dest, const int stride,
+ const int highbd_flag) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
+ const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
+ const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
+ int16x8_t in[16], step1[16], step2[16], out[16];
+
+ // Load input (16x8)
+ if (output) {
+ const tran_low_t *inputT = (const tran_low_t *)input;
+ in[0] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[8] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[1] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[9] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[2] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[10] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[3] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[11] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[4] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[12] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[5] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[13] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[6] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[14] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[7] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[15] = load_tran_low_to_s16q(inputT);
+ } else {
+ const int16_t *inputT = (const int16_t *)input;
+ in[0] = vld1q_s16(inputT);
+ inputT += 8;
+ in[8] = vld1q_s16(inputT);
+ inputT += 8;
+ in[1] = vld1q_s16(inputT);
+ inputT += 8;
+ in[9] = vld1q_s16(inputT);
+ inputT += 8;
+ in[2] = vld1q_s16(inputT);
+ inputT += 8;
+ in[10] = vld1q_s16(inputT);
+ inputT += 8;
+ in[3] = vld1q_s16(inputT);
+ inputT += 8;
+ in[11] = vld1q_s16(inputT);
+ inputT += 8;
+ in[4] = vld1q_s16(inputT);
+ inputT += 8;
+ in[12] = vld1q_s16(inputT);
+ inputT += 8;
+ in[5] = vld1q_s16(inputT);
+ inputT += 8;
+ in[13] = vld1q_s16(inputT);
+ inputT += 8;
+ in[6] = vld1q_s16(inputT);
+ inputT += 8;
+ in[14] = vld1q_s16(inputT);
+ inputT += 8;
+ in[7] = vld1q_s16(inputT);
+ inputT += 8;
+ in[15] = vld1q_s16(inputT);
+ }
+
+ // Transpose
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+ &in[15]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[1] = in[16 / 2];
+ step1[2] = in[8 / 2];
+ step1[3] = in[24 / 2];
+ step1[4] = in[4 / 2];
+ step1[5] = in[20 / 2];
+ step1[6] = in[12 / 2];
+ step1[7] = in[28 / 2];
+ step1[8] = in[2 / 2];
+ step1[9] = in[18 / 2];
+ step1[10] = in[10 / 2];
+ step1[11] = in[26 / 2];
+ step1[12] = in[6 / 2];
+ step1[13] = in[22 / 2];
+ step1[14] = in[14 / 2];
+ step1[15] = in[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+ idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
+ idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+ &step2[14]);
+ idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+ &step2[13]);
+ idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
+ idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
+ step1[8] = vaddq_s16(step2[8], step2[9]);
+ step1[9] = vsubq_s16(step2[8], step2[9]);
+ step1[10] = vsubq_s16(step2[11], step2[10]);
+ step1[11] = vaddq_s16(step2[11], step2[10]);
+ step1[12] = vaddq_s16(step2[12], step2[13]);
+ step1[13] = vsubq_s16(step2[12], step2[13]);
+ step1[14] = vsubq_s16(step2[15], step2[14]);
+ step1[15] = vaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+ idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
+ idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ idct16x16_add_stage7(step2, out);
+
+ if (output) {
+ idct16x16_store_pass1(out, output);
+ } else {
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
+ }
+}
+
+void vpx_idct16x16_38_add_half1d(const void *const input,
+                                 int16_t *const output, void *const dest,
+                                 const int stride, const int highbd_flag) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x8_t in[8], step1[16], step2[16], out[16];
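+
+  /* Why the doubled cosine tables above (illustrative): vqrdmulh computes
+   * (2 * a * b + (1 << 15)) >> 16, a rounded multiply by b / 2^15, while
+   * the idct needs a rounded multiply by cospi / 2^14 (DCT_CONST_BITS).
+   * Loading 2 * cospi makes the scalings agree:
+   *   vqrdmulh(a, 2 * cospi) == (a * cospi + (1 << 13)) >> 14
+   * (before saturation), so no extra shift is required.
+   */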
+
+ // Load input (8x8)
+ if (output) {
+ const tran_low_t *inputT = (const tran_low_t *)input;
+ in[0] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[1] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[2] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[3] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[4] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[5] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[6] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[7] = load_tran_low_to_s16q(inputT);
+ } else {
+ const int16_t *inputT = (const int16_t *)input;
+ in[0] = vld1q_s16(inputT);
+ inputT += 16;
+ in[1] = vld1q_s16(inputT);
+ inputT += 16;
+ in[2] = vld1q_s16(inputT);
+ inputT += 16;
+ in[3] = vld1q_s16(inputT);
+ inputT += 16;
+ in[4] = vld1q_s16(inputT);
+ inputT += 16;
+ in[5] = vld1q_s16(inputT);
+ inputT += 16;
+ in[6] = vld1q_s16(inputT);
+ inputT += 16;
+ in[7] = vld1q_s16(inputT);
+ }
+
+ // Transpose
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[2] = in[8 / 2];
+ step1[4] = in[4 / 2];
+ step1[6] = in[12 / 2];
+ step1[8] = in[2 / 2];
+ step1[10] = in[10 / 2];
+ step1[12] = in[6 / 2];
+ step1[14] = in[14 / 2]; // 0 in pass 1
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+ step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
+ step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
+ step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
+ step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
+ step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
+ step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
+ step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = vaddq_s16(step2[8], step2[9]);
+ step1[9] = vsubq_s16(step2[8], step2[9]);
+ step1[10] = vsubq_s16(step2[11], step2[10]);
+ step1[11] = vaddq_s16(step2[11], step2[10]);
+ step1[12] = vaddq_s16(step2[12], step2[13]);
+ step1[13] = vsubq_s16(step2[12], step2[13]);
+ step1[14] = vsubq_s16(step2[15], step2[14]);
+ step1[15] = vaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+ step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
+ step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ idct16x16_add_stage7(step2, out);
+
+ if (output) {
+ idct16x16_store_pass1(out, output);
+ } else {
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
+ }
+}
+
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int16_t *output) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x4_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x4)
+ in[0] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[1] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[2] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[3] = load_tran_low_to_s16d(input);
+
+ // Transpose
+ transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vadd_s16(step2[8], step2[11]);
+ step1[9] = vadd_s16(step2[9], step2[10]);
+ step1[10] = vsub_s16(step2[9], step2[10]);
+ step1[11] = vsub_s16(step2[8], step2[11]);
+ step1[12] = vsub_s16(step2[15], step2[12]);
+ step1[13] = vsub_s16(step2[14], step2[13]);
+ step1[14] = vadd_s16(step2[14], step2[13]);
+ step1[15] = vadd_s16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vadd_s16(step1[0], step1[7]);
+ step2[1] = vadd_s16(step1[1], step1[6]);
+ step2[2] = vadd_s16(step1[2], step1[5]);
+ step2[3] = vadd_s16(step1[3], step1[4]);
+ step2[4] = vsub_s16(step1[3], step1[4]);
+ step2[5] = vsub_s16(step1[2], step1[5]);
+ step2[6] = vsub_s16(step1[1], step1[6]);
+ step2[7] = vsub_s16(step1[0], step1[7]);
+ idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ out[0] = vadd_s16(step2[0], step2[15]);
+ out[1] = vadd_s16(step2[1], step2[14]);
+ out[2] = vadd_s16(step2[2], step2[13]);
+ out[3] = vadd_s16(step2[3], step2[12]);
+ out[4] = vadd_s16(step2[4], step2[11]);
+ out[5] = vadd_s16(step2[5], step2[10]);
+ out[6] = vadd_s16(step2[6], step2[9]);
+ out[7] = vadd_s16(step2[7], step2[8]);
+ out[8] = vsub_s16(step2[7], step2[8]);
+ out[9] = vsub_s16(step2[6], step2[9]);
+ out[10] = vsub_s16(step2[5], step2[10]);
+ out[11] = vsub_s16(step2[4], step2[11]);
+ out[12] = vsub_s16(step2[3], step2[12]);
+ out[13] = vsub_s16(step2[2], step2[13]);
+ out[14] = vsub_s16(step2[1], step2[14]);
+ out[15] = vsub_s16(step2[0], step2[15]);
+
+ // pass 1: save the result into output
+ vst1_s16(output, out[0]);
+ output += 4;
+ vst1_s16(output, out[1]);
+ output += 4;
+ vst1_s16(output, out[2]);
+ output += 4;
+ vst1_s16(output, out[3]);
+ output += 4;
+ vst1_s16(output, out[4]);
+ output += 4;
+ vst1_s16(output, out[5]);
+ output += 4;
+ vst1_s16(output, out[6]);
+ output += 4;
+ vst1_s16(output, out[7]);
+ output += 4;
+ vst1_s16(output, out[8]);
+ output += 4;
+ vst1_s16(output, out[9]);
+ output += 4;
+ vst1_s16(output, out[10]);
+ output += 4;
+ vst1_s16(output, out[11]);
+ output += 4;
+ vst1_s16(output, out[12]);
+ output += 4;
+ vst1_s16(output, out[13]);
+ output += 4;
+ vst1_s16(output, out[14]);
+ output += 4;
+ vst1_s16(output, out[15]);
+}
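+
+/* Layout note (illustrative): pass 1 stores sixteen 4-wide vectors in row
+ * order, so pass 2 below reloads them eight at a time and re-transposes
+ * with transpose_s16_4x8() to recover four 8-wide rows.
+ */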
+
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+ int16_t *const output, void *const dest,
+ const int stride,
+ const int highbd_flag) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x4_t ind[8];
+ int16x8_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x8)
+ ind[0] = vld1_s16(input);
+ input += 4;
+ ind[1] = vld1_s16(input);
+ input += 4;
+ ind[2] = vld1_s16(input);
+ input += 4;
+ ind[3] = vld1_s16(input);
+ input += 4;
+ ind[4] = vld1_s16(input);
+ input += 4;
+ ind[5] = vld1_s16(input);
+ input += 4;
+ ind[6] = vld1_s16(input);
+ input += 4;
+ ind[7] = vld1_s16(input);
+
+ // Transpose
+ transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
+ ind[7], &in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ idct16x16_add_stage7(step2, out);
+
+ if (output) {
+ idct16x16_store_pass1(out, output);
+ } else {
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
+ }
+}
+
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
+
+ // Parallel idct on the lower 8 rows
+ vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+ stride, 0);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+ 0);
+}
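+
+/* Data flow (an illustrative note): each pass-1 call consumes 8 rows of
+ * coefficients and stores its result already transposed, so the pass-2
+ * calls can run the identical half1d routine over the columns and add the
+ * final values straight into dest.
+ */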
+
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+ 0);
+}
+
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[4 * 16];
+
+ // pass 1
+  // Parallel idct on the upper 4 rows (only the top-left 4x4 is non-zero)
+ vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
+
+ // Parallel idct to get the right 8 columns
+ vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+ stride, 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
new file mode 100644
index 0000000000..057731ad92
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0,
+ int16x8_t *const in1, int16x8_t *const in2,
+ int16x8_t *const in3, int16x8_t *const in4,
+ int16x8_t *const in5, int16x8_t *const in6,
+ int16x8_t *const in7) {
+ *in0 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in1 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in2 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in3 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in4 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in5 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in6 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in7 = load_tran_low_to_s16q(input);
+}
+
+static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0,
+ int16x4_t *const in1, int16x4_t *const in2,
+ int16x4_t *const in3, int16x4_t *const in4,
+ int16x4_t *const in5, int16x4_t *const in6,
+ int16x4_t *const in7) {
+ *in0 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in1 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in2 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in3 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in4 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in5 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in6 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in7 = load_tran_low_to_s16d(input);
+}
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// 0 0 2 5 10 17 25 38 47 62 83 101 121
+// 1 1 4 8 15 22 30 45 58 74 92 112 133
+// 2 3 7 12 18 28 36 52 64 82 102 118
+// 3 6 11 16 23 31 43 60 73 90 109 126
+// 4 9 14 19 29 37 50 65 78 98 116 134
+// 5 13 20 26 35 44 54 72 85 105 123
+// 6 21 27 33 42 53 63 80 94 113 132
+// 7 24 32 39 48 57 71 88 104 120
+// 8 34 40 46 56 68 81 96 111 130
+// 9 41 49 55 67 77 91 107 124
+// 10 51 59 66 76 89 99 119 131
+// 11 61 69 75 87 100 114 129
+// 12 70 79 86 97 108 122
+// 13 84 93 103 110 125
+// 14 95 106 115 127
+// 15 117 128
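+// For example, reading the table above: the last coefficients, 133 and 134,
+// land in columns 11 and 10 (rows 1 and 4), and no entry falls beyond
+// column 11 -- hence exactly 12 columns survive.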
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) {
+ int16x4_t tmp[8];
+ int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32];
+
+ load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5],
+ &tmp[6], &tmp[7]);
+ transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6],
+ tmp[7], &in[8], &in[9], &in[10], &in[11]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ s2[18] = vsubq_s16(s1[19], s1[18]);
+ s2[19] = vaddq_s16(s1[18], s1[19]);
+ s2[20] = vaddq_s16(s1[20], s1[21]);
+ s2[21] = vsubq_s16(s1[20], s1[21]);
+ s2[26] = vsubq_s16(s1[27], s1[26]);
+ s2[27] = vaddq_s16(s1[26], s1[27]);
+ s2[28] = vaddq_s16(s1[28], s1[29]);
+ s2[29] = vsubq_s16(s1[28], s1[29]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s3[10] = vsubq_s16(s2[11], s2[10]);
+ s3[11] = vaddq_s16(s2[10], s2[11]);
+ s3[12] = vaddq_s16(s2[12], s2[13]);
+ s3[13] = vsubq_s16(s2[12], s2[13]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+ cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+ cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+ cospi_24_64);
+
+ s4[16] = vaddq_s16(s1[16], s2[19]);
+ s4[17] = vaddq_s16(s3[17], s3[18]);
+ s4[18] = vsubq_s16(s3[17], s3[18]);
+ s4[19] = vsubq_s16(s1[16], s2[19]);
+ s4[20] = vsubq_s16(s1[23], s2[20]);
+ s4[21] = vsubq_s16(s3[22], s3[21]);
+ s4[22] = vaddq_s16(s3[21], s3[22]);
+ s4[23] = vaddq_s16(s2[20], s1[23]);
+ s4[24] = vaddq_s16(s1[24], s2[27]);
+ s4[25] = vaddq_s16(s3[25], s3[26]);
+ s4[26] = vsubq_s16(s3[25], s3[26]);
+ s4[27] = vsubq_s16(s1[24], s2[27]);
+ s4[28] = vsubq_s16(s1[31], s2[28]);
+ s4[29] = vsubq_s16(s3[30], s3[29]);
+ s4[30] = vaddq_s16(s3[29], s3[30]);
+ s4[31] = vaddq_s16(s2[28], s1[31]);
+
+ // stage 5
+ s5[0] = vaddq_s16(s4[0], s4[3]);
+ s5[1] = vaddq_s16(s4[0], s4[2]);
+ s5[2] = vsubq_s16(s4[0], s4[2]);
+ s5[3] = vsubq_s16(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64);
+
+ s5[8] = vaddq_s16(s2[8], s3[11]);
+ s5[9] = vaddq_s16(s4[9], s4[10]);
+ s5[10] = vsubq_s16(s4[9], s4[10]);
+ s5[11] = vsubq_s16(s2[8], s3[11]);
+ s5[12] = vsubq_s16(s2[15], s3[12]);
+ s5[13] = vsubq_s16(s4[14], s4[13]);
+ s5[14] = vaddq_s16(s4[13], s4[14]);
+ s5[15] = vaddq_s16(s2[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+ cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+ cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+ cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+ cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+ cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+ cospi_24_64);
+
+ // stage 6
+ s6[0] = vaddq_s16(s5[0], s3[7]);
+ s6[1] = vaddq_s16(s5[1], s5[6]);
+ s6[2] = vaddq_s16(s5[2], s5[5]);
+ s6[3] = vaddq_s16(s5[3], s3[4]);
+ s6[4] = vsubq_s16(s5[3], s3[4]);
+ s6[5] = vsubq_s16(s5[2], s5[5]);
+ s6[6] = vsubq_s16(s5[1], s5[6]);
+ s6[7] = vsubq_s16(s5[0], s3[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = vaddq_s16(s4[16], s4[23]);
+ s6[17] = vaddq_s16(s4[17], s4[22]);
+ s6[18] = vaddq_s16(s5[18], s5[21]);
+ s6[19] = vaddq_s16(s5[19], s5[20]);
+ s6[20] = vsubq_s16(s5[19], s5[20]);
+ s6[21] = vsubq_s16(s5[18], s5[21]);
+ s6[22] = vsubq_s16(s4[17], s4[22]);
+ s6[23] = vsubq_s16(s4[16], s4[23]);
+
+ s6[24] = vsubq_s16(s4[31], s4[24]);
+ s6[25] = vsubq_s16(s4[30], s4[25]);
+ s6[26] = vsubq_s16(s5[29], s5[26]);
+ s6[27] = vsubq_s16(s5[28], s5[27]);
+ s6[28] = vaddq_s16(s5[27], s5[28]);
+ s6[29] = vaddq_s16(s5[26], s5[29]);
+ s6[30] = vaddq_s16(s4[25], s4[30]);
+ s6[31] = vaddq_s16(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = vaddq_s16(s6[0], s5[15]);
+ s7[1] = vaddq_s16(s6[1], s5[14]);
+ s7[2] = vaddq_s16(s6[2], s6[13]);
+ s7[3] = vaddq_s16(s6[3], s6[12]);
+ s7[4] = vaddq_s16(s6[4], s6[11]);
+ s7[5] = vaddq_s16(s6[5], s6[10]);
+ s7[6] = vaddq_s16(s6[6], s5[9]);
+ s7[7] = vaddq_s16(s6[7], s5[8]);
+ s7[8] = vsubq_s16(s6[7], s5[8]);
+ s7[9] = vsubq_s16(s6[6], s5[9]);
+ s7[10] = vsubq_s16(s6[5], s6[10]);
+ s7[11] = vsubq_s16(s6[4], s6[11]);
+ s7[12] = vsubq_s16(s6[3], s6[12]);
+ s7[13] = vsubq_s16(s6[2], s6[13]);
+ s7[14] = vsubq_s16(s6[1], s5[14]);
+ s7[15] = vsubq_s16(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s7[0], s6[31]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[1], s6[30]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[2], s6[29]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[3], s6[28]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[4], s7[27]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[5], s7[26]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[6], s7[25]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[7], s7[24]));
+ output += 16;
+
+ vst1q_s16(output, vaddq_s16(s7[8], s7[23]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[9], s7[22]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[10], s7[21]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[11], s7[20]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[12], s6[19]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[13], s6[18]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[14], s6[17]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[15], s6[16]));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7[15], s6[16]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[14], s6[17]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[13], s6[18]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[12], s6[19]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[11], s7[20]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[10], s7[21]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[9], s7[22]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[8], s7[23]));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7[7], s7[24]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[6], s7[25]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[5], s7[26]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[4], s7[27]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[3], s6[28]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[2], s6[29]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[1], s6[30]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
+}
+
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+ const int stride, const int highbd_flag) {
+ int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ out[32];
+
+ load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+ &in[12], &in[13], &in[14], &in[15]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64);
+ s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64);
+
+ s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
+
+ s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64);
+ s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64);
+ s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64);
+
+ s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ s2[16] = vaddq_s16(s1[16], s1[17]);
+ s2[17] = vsubq_s16(s1[16], s1[17]);
+ s2[18] = vsubq_s16(s1[19], s1[18]);
+ s2[19] = vaddq_s16(s1[18], s1[19]);
+ s2[20] = vaddq_s16(s1[20], s1[21]);
+ s2[21] = vsubq_s16(s1[20], s1[21]);
+ s2[22] = vsubq_s16(s1[23], s1[22]);
+ s2[23] = vaddq_s16(s1[22], s1[23]);
+ s2[24] = vaddq_s16(s1[24], s1[25]);
+ s2[25] = vsubq_s16(s1[24], s1[25]);
+ s2[26] = vsubq_s16(s1[27], s1[26]);
+ s2[27] = vaddq_s16(s1[26], s1[27]);
+ s2[28] = vaddq_s16(s1[28], s1[29]);
+ s2[29] = vsubq_s16(s1[28], s1[29]);
+ s2[30] = vsubq_s16(s1[31], s1[30]);
+ s2[31] = vaddq_s16(s1[30], s1[31]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64);
+ s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64);
+
+ s3[8] = vaddq_s16(s2[8], s2[9]);
+ s3[9] = vsubq_s16(s2[8], s2[9]);
+ s3[10] = vsubq_s16(s2[11], s2[10]);
+ s3[11] = vaddq_s16(s2[10], s2[11]);
+ s3[12] = vaddq_s16(s2[12], s2[13]);
+ s3[13] = vsubq_s16(s2[12], s2[13]);
+ s3[14] = vsubq_s16(s2[15], s2[14]);
+ s3[15] = vaddq_s16(s2[14], s2[15]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30],
+ cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30],
+ cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+ cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+ cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64,
+ s2[25], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64,
+ s2[25], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
+
+ s4[4] = vaddq_s16(s3[4], s3[5]);
+ s4[5] = vsubq_s16(s3[4], s3[5]);
+ s4[6] = vsubq_s16(s3[7], s3[6]);
+ s4[7] = vaddq_s16(s3[6], s3[7]);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14],
+ cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14],
+ cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+ cospi_24_64);
+
+ s4[16] = vaddq_s16(s2[16], s2[19]);
+ s4[17] = vaddq_s16(s3[17], s3[18]);
+ s4[18] = vsubq_s16(s3[17], s3[18]);
+ s4[19] = vsubq_s16(s2[16], s2[19]);
+ s4[20] = vsubq_s16(s2[23], s2[20]);
+ s4[21] = vsubq_s16(s3[22], s3[21]);
+ s4[22] = vaddq_s16(s3[21], s3[22]);
+ s4[23] = vaddq_s16(s2[20], s2[23]);
+ s4[24] = vaddq_s16(s2[24], s2[27]);
+ s4[25] = vaddq_s16(s3[25], s3[26]);
+ s4[26] = vsubq_s16(s3[25], s3[26]);
+ s4[27] = vsubq_s16(s2[24], s2[27]);
+ s4[28] = vsubq_s16(s2[31], s2[28]);
+ s4[29] = vsubq_s16(s3[30], s3[29]);
+ s4[30] = vaddq_s16(s3[29], s3[30]);
+ s4[31] = vaddq_s16(s2[28], s2[31]);
+
+ // stage 5
+ s5[0] = vaddq_s16(s4[0], s4[3]);
+ s5[1] = vaddq_s16(s4[0], s4[2]);
+ s5[2] = vsubq_s16(s4[0], s4[2]);
+ s5[3] = vsubq_s16(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64);
+
+ s5[8] = vaddq_s16(s3[8], s3[11]);
+ s5[9] = vaddq_s16(s4[9], s4[10]);
+ s5[10] = vsubq_s16(s4[9], s4[10]);
+ s5[11] = vsubq_s16(s3[8], s3[11]);
+ s5[12] = vsubq_s16(s3[15], s3[12]);
+ s5[13] = vsubq_s16(s4[14], s4[13]);
+ s5[14] = vaddq_s16(s4[13], s4[14]);
+ s5[15] = vaddq_s16(s3[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+ cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+ cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+ cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+ cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+ cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+ cospi_24_64);
+
+ // stage 6
+ s6[0] = vaddq_s16(s5[0], s4[7]);
+ s6[1] = vaddq_s16(s5[1], s5[6]);
+ s6[2] = vaddq_s16(s5[2], s5[5]);
+ s6[3] = vaddq_s16(s5[3], s4[4]);
+ s6[4] = vsubq_s16(s5[3], s4[4]);
+ s6[5] = vsubq_s16(s5[2], s5[5]);
+ s6[6] = vsubq_s16(s5[1], s5[6]);
+ s6[7] = vsubq_s16(s5[0], s4[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = vaddq_s16(s4[16], s4[23]);
+ s6[17] = vaddq_s16(s4[17], s4[22]);
+ s6[18] = vaddq_s16(s5[18], s5[21]);
+ s6[19] = vaddq_s16(s5[19], s5[20]);
+ s6[20] = vsubq_s16(s5[19], s5[20]);
+ s6[21] = vsubq_s16(s5[18], s5[21]);
+ s6[22] = vsubq_s16(s4[17], s4[22]);
+ s6[23] = vsubq_s16(s4[16], s4[23]);
+ s6[24] = vsubq_s16(s4[31], s4[24]);
+ s6[25] = vsubq_s16(s4[30], s4[25]);
+ s6[26] = vsubq_s16(s5[29], s5[26]);
+ s6[27] = vsubq_s16(s5[28], s5[27]);
+ s6[28] = vaddq_s16(s5[27], s5[28]);
+ s6[29] = vaddq_s16(s5[26], s5[29]);
+ s6[30] = vaddq_s16(s4[25], s4[30]);
+ s6[31] = vaddq_s16(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = vaddq_s16(s6[0], s5[15]);
+ s7[1] = vaddq_s16(s6[1], s5[14]);
+ s7[2] = vaddq_s16(s6[2], s6[13]);
+ s7[3] = vaddq_s16(s6[3], s6[12]);
+ s7[4] = vaddq_s16(s6[4], s6[11]);
+ s7[5] = vaddq_s16(s6[5], s6[10]);
+ s7[6] = vaddq_s16(s6[6], s5[9]);
+ s7[7] = vaddq_s16(s6[7], s5[8]);
+ s7[8] = vsubq_s16(s6[7], s5[8]);
+ s7[9] = vsubq_s16(s6[6], s5[9]);
+ s7[10] = vsubq_s16(s6[5], s6[10]);
+ s7[11] = vsubq_s16(s6[4], s6[11]);
+ s7[12] = vsubq_s16(s6[3], s6[12]);
+ s7[13] = vsubq_s16(s6[2], s6[13]);
+ s7[14] = vsubq_s16(s6[1], s5[14]);
+ s7[15] = vsubq_s16(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ out[0] = final_add(s7[0], s6[31]);
+ out[1] = final_add(s7[1], s6[30]);
+ out[2] = final_add(s7[2], s6[29]);
+ out[3] = final_add(s7[3], s6[28]);
+ out[4] = final_add(s7[4], s7[27]);
+ out[5] = final_add(s7[5], s7[26]);
+ out[6] = final_add(s7[6], s7[25]);
+ out[7] = final_add(s7[7], s7[24]);
+ out[8] = final_add(s7[8], s7[23]);
+ out[9] = final_add(s7[9], s7[22]);
+ out[10] = final_add(s7[10], s7[21]);
+ out[11] = final_add(s7[11], s7[20]);
+ out[12] = final_add(s7[12], s6[19]);
+ out[13] = final_add(s7[13], s6[18]);
+ out[14] = final_add(s7[14], s6[17]);
+ out[15] = final_add(s7[15], s6[16]);
+ out[16] = final_sub(s7[15], s6[16]);
+ out[17] = final_sub(s7[14], s6[17]);
+ out[18] = final_sub(s7[13], s6[18]);
+ out[19] = final_sub(s7[12], s6[19]);
+ out[20] = final_sub(s7[11], s7[20]);
+ out[21] = final_sub(s7[10], s7[21]);
+ out[22] = final_sub(s7[9], s7[22]);
+ out[23] = final_sub(s7[8], s7[23]);
+ out[24] = final_sub(s7[7], s7[24]);
+ out[25] = final_sub(s7[6], s7[25]);
+ out[26] = final_sub(s7[5], s7[26]);
+ out[27] = final_sub(s7[4], s7[27]);
+ out[28] = final_sub(s7[3], s6[28]);
+ out[29] = final_sub(s7[2], s6[29]);
+ out[30] = final_sub(s7[1], s6[30]);
+ out[31] = final_sub(s7[0], s6[31]);
+
+ if (highbd_flag) {
+ highbd_add_and_store_bd8(out, output, stride);
+ } else {
+ uint8_t *const outputT = (uint8_t *)output;
+ add_and_store_u8_s16(out + 0, outputT, stride);
+ add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+ add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+ add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+ }
+}
+
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+
+ vpx_idct32_12_neon(input, temp);
+ vpx_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_16_neon(t, dest, stride, 0);
+ t += (16 * 8);
+ dest += 8;
+ }
+}
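+
+/* Pass structure (illustrative): the two pass-1 calls cover coefficient
+ * rows 0-7 and 8-15 and fill temp as a 32-row x 16-column int16_t block,
+ * the second call writing columns 8-15 via temp + 8. Each pass-2 iteration
+ * then consumes 8 of those rows (t += 16 * 8) and emits 8 fully
+ * reconstructed output columns (dest += 8).
+ */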
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 0000000000..8920b93363
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqaddq_u8(a0, res);
+ const uint8x16_t b1 = vqaddq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
+}
+
+static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqsubq_u8(a0, res);
+ const uint8x16_t b1 = vqsubq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
+}
+
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_pos_kernel(&dest, stride, dc);
+ }
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
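+
+/* Equivalent scalar form of the loops above (an illustrative sketch;
+ * clip_pixel() from vpx_dsp_common.h clamps to [0, 255]):
+ *   for (r = 0; r < 32; ++r, dest += stride)
+ *     for (c = 0; c < 32; ++c) dest[c] = clip_pixel(dest[c] + a1);
+ * The NEON kernels cover each 32-pixel row with two 16-byte vectors.
+ */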
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
new file mode 100644
index 0000000000..f570547e44
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[6|7] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
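+// For example, reading the table above: the largest index, 33, sits in row
+// 6, and no coefficient falls beyond column 5 (25 and 30 in rows 0 and 1),
+// so only 6 columns survive.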
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) {
+ int16x8_t in[8], s1[32], s2[32], s3[32];
+
+ in[0] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[1] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[2] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[3] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[4] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[5] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[6] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[7] = load_tran_low_to_s16q(input);
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+ cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s2[20] = vsubq_s16(s1[23], s1[20]);
+ s2[21] = vsubq_s16(s1[22], s1[21]);
+ s2[22] = vaddq_s16(s1[21], s1[22]);
+ s2[23] = vaddq_s16(s1[20], s1[23]);
+ s2[24] = vaddq_s16(s1[24], s1[27]);
+ s2[25] = vaddq_s16(s1[25], s1[26]);
+ s2[26] = vsubq_s16(s1[25], s1[26]);
+ s2[27] = vsubq_s16(s1[24], s1[27]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
+ cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
+ cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
+ cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
+ cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+ cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+ cospi_24_64);
+
+ // stage 6
+ s2[0] = vaddq_s16(s1[0], s1[7]);
+ s2[1] = vaddq_s16(s1[0], s1[6]);
+ s2[2] = vaddq_s16(s1[0], s1[5]);
+ s2[3] = vaddq_s16(s1[0], s1[4]);
+ s2[4] = vsubq_s16(s1[0], s1[4]);
+ s2[5] = vsubq_s16(s1[0], s1[5]);
+ s2[6] = vsubq_s16(s1[0], s1[6]);
+ s2[7] = vsubq_s16(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);
+
+ s2[16] = vaddq_s16(s1[16], s2[23]);
+ s2[17] = vaddq_s16(s1[17], s2[22]);
+ s2[18] = vaddq_s16(s1[18], s1[21]);
+ s2[19] = vaddq_s16(s1[19], s1[20]);
+ s2[20] = vsubq_s16(s1[19], s1[20]);
+ s2[21] = vsubq_s16(s1[18], s1[21]);
+ s2[22] = vsubq_s16(s1[17], s2[22]);
+ s2[23] = vsubq_s16(s1[16], s2[23]);
+
+ s3[24] = vsubq_s16(s1[31], s2[24]);
+ s3[25] = vsubq_s16(s1[30], s2[25]);
+ s3[26] = vsubq_s16(s1[29], s1[26]);
+ s3[27] = vsubq_s16(s1[28], s1[27]);
+ s2[28] = vaddq_s16(s1[27], s1[28]);
+ s2[29] = vaddq_s16(s1[26], s1[29]);
+ s2[30] = vaddq_s16(s2[25], s1[30]);
+ s2[31] = vaddq_s16(s2[24], s1[31]);
+
+ // stage 7
+ s1[0] = vaddq_s16(s2[0], s2[15]);
+ s1[1] = vaddq_s16(s2[1], s2[14]);
+ s1[2] = vaddq_s16(s2[2], s2[13]);
+ s1[3] = vaddq_s16(s2[3], s2[12]);
+ s1[4] = vaddq_s16(s2[4], s2[11]);
+ s1[5] = vaddq_s16(s2[5], s2[10]);
+ s1[6] = vaddq_s16(s2[6], s2[9]);
+ s1[7] = vaddq_s16(s2[7], s2[8]);
+ s1[8] = vsubq_s16(s2[7], s2[8]);
+ s1[9] = vsubq_s16(s2[6], s2[9]);
+ s1[10] = vsubq_s16(s2[5], s2[10]);
+ s1[11] = vsubq_s16(s2[4], s2[11]);
+ s1[12] = vsubq_s16(s2[3], s2[12]);
+ s1[13] = vsubq_s16(s2[2], s2[13]);
+ s1[14] = vsubq_s16(s2[1], s2[14]);
+ s1[15] = vsubq_s16(s2[0], s2[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
+
+ s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);
+
+ s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
+ output += 8;
+
+ vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
+}
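
The stage-1 comments above spell out why each butterfly collapses: on this reduced-coefficient path the second input of every stage-1 rotation is known to be zero, so the usual two-multiply rotation degenerates into a single multiply per output. A scalar sketch of the collapse (helper name is illustrative; DCT_CONST_BITS is 14 in libvpx):

#include <stdint.h>

#define DCT_CONST_BITS 14

/* Full butterfly: out_lo = round_shift(a * c_hi - b * c_lo)
 *                 out_hi = round_shift(a * c_lo + b * c_hi)
 * With b == 0 (e.g. input[31] above), each output reduces to one multiply,
 * which is what multiply_shift_and_narrow_s16() computes per lane. */
static int16_t collapsed_butterfly(int16_t a, int16_t c) {
  return (int16_t)((a * c + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}
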
+
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+ const int highbd_flag) {
+ int16x8_t in[8], s1[32], s2[32], s3[32], out[32];
+
+ load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ // Different for _8_
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ // Different for _8_
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
+ s1[28], -cospi_4_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
+ cospi_28_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+ cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
+ s2[12], -cospi_8_64);
+ s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
+ cospi_24_64);
+
+ s2[16] = vaddq_s16(s1[16], s1[19]);
+
+ s2[17] = vaddq_s16(s1[17], s1[18]);
+ s2[18] = vsubq_s16(s1[17], s1[18]);
+
+ s2[19] = vsubq_s16(s1[16], s1[19]);
+
+ s2[20] = vsubq_s16(s1[23], s1[20]);
+ s2[21] = vsubq_s16(s1[22], s1[21]);
+
+ s2[22] = vaddq_s16(s1[21], s1[22]);
+ s2[23] = vaddq_s16(s1[20], s1[23]);
+
+ s2[24] = vaddq_s16(s1[24], s1[27]);
+ s2[25] = vaddq_s16(s1[25], s1[26]);
+ s2[26] = vsubq_s16(s1[25], s1[26]);
+ s2[27] = vsubq_s16(s1[24], s1[27]);
+
+ s2[28] = vsubq_s16(s1[31], s1[28]);
+ s2[29] = vsubq_s16(s1[30], s1[29]);
+ s2[30] = vaddq_s16(s1[29], s1[30]);
+ s2[31] = vaddq_s16(s1[28], s1[31]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
+
+ s1[8] = vaddq_s16(s2[8], s2[11]);
+ s1[9] = vaddq_s16(s2[9], s2[10]);
+ s1[10] = vsubq_s16(s2[9], s2[10]);
+ s1[11] = vsubq_s16(s2[8], s2[11]);
+ s1[12] = vsubq_s16(s2[15], s2[12]);
+ s1[13] = vsubq_s16(s2[14], s2[13]);
+ s1[14] = vaddq_s16(s2[13], s2[14]);
+ s1[15] = vaddq_s16(s2[12], s2[15]);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
+ cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
+ cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
+ cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
+ cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+ cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+ cospi_24_64);
+
+ // stage 6
+ s2[0] = vaddq_s16(s1[0], s1[7]);
+ s2[1] = vaddq_s16(s1[0], s1[6]);
+ s2[2] = vaddq_s16(s1[0], s1[5]);
+ s2[3] = vaddq_s16(s1[0], s1[4]);
+ s2[4] = vsubq_s16(s1[0], s1[4]);
+ s2[5] = vsubq_s16(s1[0], s1[5]);
+ s2[6] = vsubq_s16(s1[0], s1[6]);
+ s2[7] = vsubq_s16(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);
+
+ s1[16] = vaddq_s16(s2[16], s2[23]);
+ s1[17] = vaddq_s16(s2[17], s2[22]);
+ s2[18] = vaddq_s16(s1[18], s1[21]);
+ s2[19] = vaddq_s16(s1[19], s1[20]);
+ s2[20] = vsubq_s16(s1[19], s1[20]);
+ s2[21] = vsubq_s16(s1[18], s1[21]);
+ s1[22] = vsubq_s16(s2[17], s2[22]);
+ s1[23] = vsubq_s16(s2[16], s2[23]);
+
+ s3[24] = vsubq_s16(s2[31], s2[24]);
+ s3[25] = vsubq_s16(s2[30], s2[25]);
+ s3[26] = vsubq_s16(s1[29], s1[26]);
+ s3[27] = vsubq_s16(s1[28], s1[27]);
+ s2[28] = vaddq_s16(s1[27], s1[28]);
+ s2[29] = vaddq_s16(s1[26], s1[29]);
+ s2[30] = vaddq_s16(s2[25], s2[30]);
+ s2[31] = vaddq_s16(s2[24], s2[31]);
+
+ // stage 7
+ s1[0] = vaddq_s16(s2[0], s1[15]);
+ s1[1] = vaddq_s16(s2[1], s1[14]);
+ s1[2] = vaddq_s16(s2[2], s2[13]);
+ s1[3] = vaddq_s16(s2[3], s2[12]);
+ s1[4] = vaddq_s16(s2[4], s2[11]);
+ s1[5] = vaddq_s16(s2[5], s2[10]);
+ s1[6] = vaddq_s16(s2[6], s1[9]);
+ s1[7] = vaddq_s16(s2[7], s1[8]);
+ s1[8] = vsubq_s16(s2[7], s1[8]);
+ s1[9] = vsubq_s16(s2[6], s1[9]);
+ s1[10] = vsubq_s16(s2[5], s2[10]);
+ s1[11] = vsubq_s16(s2[4], s2[11]);
+ s1[12] = vsubq_s16(s2[3], s2[12]);
+ s1[13] = vsubq_s16(s2[2], s2[13]);
+ s1[14] = vsubq_s16(s2[1], s1[14]);
+ s1[15] = vsubq_s16(s2[0], s1[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
+
+ s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);
+
+ s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);
+
+ // final stage
+ out[0] = final_add(s1[0], s2[31]);
+ out[1] = final_add(s1[1], s2[30]);
+ out[2] = final_add(s1[2], s2[29]);
+ out[3] = final_add(s1[3], s2[28]);
+ out[4] = final_add(s1[4], s1[27]);
+ out[5] = final_add(s1[5], s1[26]);
+ out[6] = final_add(s1[6], s1[25]);
+ out[7] = final_add(s1[7], s1[24]);
+ out[8] = final_add(s1[8], s2[23]);
+ out[9] = final_add(s1[9], s2[22]);
+ out[10] = final_add(s1[10], s1[21]);
+ out[11] = final_add(s1[11], s1[20]);
+ out[12] = final_add(s1[12], s2[19]);
+ out[13] = final_add(s1[13], s2[18]);
+ out[14] = final_add(s1[14], s1[17]);
+ out[15] = final_add(s1[15], s1[16]);
+ out[16] = final_sub(s1[15], s1[16]);
+ out[17] = final_sub(s1[14], s1[17]);
+ out[18] = final_sub(s1[13], s2[18]);
+ out[19] = final_sub(s1[12], s2[19]);
+ out[20] = final_sub(s1[11], s1[20]);
+ out[21] = final_sub(s1[10], s1[21]);
+ out[22] = final_sub(s1[9], s2[22]);
+ out[23] = final_sub(s1[8], s2[23]);
+ out[24] = final_sub(s1[7], s1[24]);
+ out[25] = final_sub(s1[6], s1[25]);
+ out[26] = final_sub(s1[5], s1[26]);
+ out[27] = final_sub(s1[4], s1[27]);
+ out[28] = final_sub(s1[3], s2[28]);
+ out[29] = final_sub(s1[2], s2[29]);
+ out[30] = final_sub(s1[1], s2[30]);
+ out[31] = final_sub(s1[0], s2[31]);
+
+ if (highbd_flag) {
+ highbd_add_and_store_bd8(out, output, stride);
+ } else {
+ uint8_t *const outputT = (uint8_t *)output;
+ add_and_store_u8_s16(out + 0, outputT, stride);
+ add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+ add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+ add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+ }
+}
+
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 8];
+ int16_t *t = temp;
+
+ vpx_idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_8_neon(t, dest, stride, 0);
+ t += (8 * 8);
+ dest += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 0000000000..9f4589ea96
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_from_transformed(const int16_t *const trans_buf,
+ const int first, const int second,
+ int16x8_t *const q0,
+ int16x8_t *const q1) {
+ *q0 = vld1q_s16(trans_buf + first * 8);
+ *q1 = vld1q_s16(trans_buf + second * 8);
+}
+
+static INLINE void load_from_output(const int16_t *const out, const int first,
+ const int second, int16x8_t *const q0,
+ int16x8_t *const q1) {
+ *q0 = vld1q_s16(out + first * 32);
+ *q1 = vld1q_s16(out + second * 32);
+}
+
+static INLINE void store_in_output(int16_t *const out, const int first,
+ const int second, const int16x8_t q0,
+ const int16x8_t q1) {
+ vst1q_s16(out + first * 32, q0);
+ vst1q_s16(out + second * 32, q1);
+}
+
+static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
+ const int stride, int16x8_t q0,
+ int16x8_t q1, int16x8_t q2,
+ int16x8_t q3) {
+ uint8x8_t d[4];
+
+ d[0] = vld1_u8(p1);
+ p1 += stride;
+ d[1] = vld1_u8(p1);
+ d[3] = vld1_u8(p2);
+ p2 -= stride;
+ d[2] = vld1_u8(p2);
+
+ q0 = vrshrq_n_s16(q0, 6);
+ q1 = vrshrq_n_s16(q1, 6);
+ q2 = vrshrq_n_s16(q2, 6);
+ q3 = vrshrq_n_s16(q3, 6);
+
+ q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
+ q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
+ q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
+ q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));
+
+ d[0] = vqmovun_s16(q0);
+ d[1] = vqmovun_s16(q1);
+ d[2] = vqmovun_s16(q2);
+ d[3] = vqmovun_s16(q3);
+
+ vst1_u8(p1, d[1]);
+ p1 -= stride;
+ vst1_u8(p1, d[0]);
+ vst1_u8(p2, d[2]);
+ p2 += stride;
+ vst1_u8(p2, d[3]);
+}
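
store_combine_results() performs the final rounding and writeback for the 32x32 transform: a rounding shift by 6 (the 32x32 form of ROUND_POWER_OF_TWO(., 6)), a widening add into the destination pixels, and a saturating narrow back to 8 bits. A one-pixel scalar model (a sketch, not library code):

#include <stdint.h>

/* vrshrq_n_s16(., 6), vaddw_u8 and vqmovun_s16 for a single pixel. */
static uint8_t combine_pixel(int16_t coeff, uint8_t pixel) {
  const int v = ((coeff + 32) >> 6) + pixel;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
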
+
+static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2,
+ const int stride,
+ int16x8_t q0, int16x8_t q1,
+ int16x8_t q2,
+ int16x8_t q3) {
+ uint16x8_t d[4];
+
+ d[0] = vld1q_u16(p1);
+ p1 += stride;
+ d[1] = vld1q_u16(p1);
+ d[3] = vld1q_u16(p2);
+ p2 -= stride;
+ d[2] = vld1q_u16(p2);
+
+ q0 = vrshrq_n_s16(q0, 6);
+ q1 = vrshrq_n_s16(q1, 6);
+ q2 = vrshrq_n_s16(q2, 6);
+ q3 = vrshrq_n_s16(q3, 6);
+
+ q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0]));
+ q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1]));
+ q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2]));
+ q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3]));
+
+ d[0] = vmovl_u8(vqmovun_s16(q0));
+ d[1] = vmovl_u8(vqmovun_s16(q1));
+ d[2] = vmovl_u8(vqmovun_s16(q2));
+ d[3] = vmovl_u8(vqmovun_s16(q3));
+
+ vst1q_u16(p1, d[1]);
+ p1 -= stride;
+ vst1q_u16(p1, d[0]);
+ vst1q_u16(p2, d[2]);
+ p2 += stride;
+ vst1q_u16(p2, d[3]);
+}
+
+static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
+ const int16_t first_const,
+ const int16_t second_const,
+ int16x8_t *const qOut0,
+ int16x8_t *const qOut1) {
+ int32x4_t q[4];
+ int16x4_t d[6];
+
+ d[0] = vget_low_s16(qIn0);
+ d[1] = vget_high_s16(qIn0);
+ d[2] = vget_low_s16(qIn1);
+ d[3] = vget_high_s16(qIn1);
+
+  // Note: using v{mul, mla, mls}l_n_s16 here is about 35% slower with gcc 4.9.
+ d[4] = vdup_n_s16(first_const);
+ d[5] = vdup_n_s16(second_const);
+
+ q[0] = vmull_s16(d[0], d[4]);
+ q[1] = vmull_s16(d[1], d[4]);
+ q[0] = vmlsl_s16(q[0], d[2], d[5]);
+ q[1] = vmlsl_s16(q[1], d[3], d[5]);
+
+ q[2] = vmull_s16(d[0], d[5]);
+ q[3] = vmull_s16(d[1], d[5]);
+ q[2] = vmlal_s16(q[2], d[2], d[4]);
+ q[3] = vmlal_s16(q[3], d[3], d[4]);
+
+ *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS),
+ vrshrn_n_s32(q[1], DCT_CONST_BITS));
+ *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS),
+ vrshrn_n_s32(q[3], DCT_CONST_BITS));
+}
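
For reference, the rotation do_butterfly() implements, written per lane in scalar form (a sketch; the intrinsics' widening and narrowing are folded into plain integer arithmetic):

#include <stdint.h>

#define DCT_CONST_BITS 14

/* qOut0 follows the vmull/vmlsl path, qOut1 the vmull/vmlal path, and
 * both end with the rounding narrow (vrshrn) by DCT_CONST_BITS. */
static void butterfly_scalar(int16_t in0, int16_t in1, int16_t c0, int16_t c1,
                             int16_t *out0, int16_t *out1) {
  const int32_t t0 = in0 * c0 - in1 * c1;
  const int32_t t1 = in0 * c1 + in1 * c0;
  *out0 = (int16_t)((t0 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
  *out1 = (int16_t)((t1 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}
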
+
+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0,
+ int16x8_t *const s1, int16x8_t *const s2,
+ int16x8_t *const s3, int16x8_t *const s4,
+ int16x8_t *const s5, int16x8_t *const s6,
+ int16x8_t *const s7) {
+ *s0 = vld1q_s16(in);
+ in += 32;
+ *s1 = vld1q_s16(in);
+ in += 32;
+ *s2 = vld1q_s16(in);
+ in += 32;
+ *s3 = vld1q_s16(in);
+ in += 32;
+ *s4 = vld1q_s16(in);
+ in += 32;
+ *s5 = vld1q_s16(in);
+ in += 32;
+ *s6 = vld1q_s16(in);
+ in += 32;
+ *s7 = vld1q_s16(in);
+}
+
+static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
+ int16x8_t a2, int16x8_t a3,
+ int16x8_t a4, int16x8_t a5,
+ int16x8_t a6, int16x8_t a7,
+ int16_t **out) {
+ transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ vst1q_s16(*out, a0);
+ *out += 8;
+ vst1q_s16(*out, a1);
+ *out += 8;
+ vst1q_s16(*out, a2);
+ *out += 8;
+ vst1q_s16(*out, a3);
+ *out += 8;
+ vst1q_s16(*out, a4);
+ *out += 8;
+ vst1q_s16(*out, a5);
+ *out += 8;
+ vst1q_s16(*out, a6);
+ *out += 8;
+ vst1q_s16(*out, a7);
+ *out += 8;
+}
+
+static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
+ int i;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void load_s16x8q_tran_low(
+ const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4,
+ int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) {
+ *s0 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s1 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s2 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s3 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s4 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s5 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s6 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s7 = load_tran_low_to_s16q(in);
+}
+
+static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
+ int16_t *t_buf) {
+ int i;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
+ }
+}
+#else // !CONFIG_VP9_HIGHBITDEPTH
+#define idct32_transpose_pair_tran_low idct32_transpose_pair
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void idct32_bands_end_1st_pass(int16_t *const out,
+ int16x8_t *const q) {
+ store_in_output(out, 16, 17, q[6], q[7]);
+ store_in_output(out, 14, 15, q[8], q[9]);
+
+ load_from_output(out, 30, 31, &q[0], &q[1]);
+ q[4] = vaddq_s16(q[2], q[1]);
+ q[5] = vaddq_s16(q[3], q[0]);
+ q[6] = vsubq_s16(q[3], q[0]);
+ q[7] = vsubq_s16(q[2], q[1]);
+ store_in_output(out, 30, 31, q[6], q[7]);
+ store_in_output(out, 0, 1, q[4], q[5]);
+
+ load_from_output(out, 12, 13, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[10], q[1]);
+ q[3] = vaddq_s16(q[11], q[0]);
+ q[4] = vsubq_s16(q[11], q[0]);
+ q[5] = vsubq_s16(q[10], q[1]);
+
+ load_from_output(out, 18, 19, &q[0], &q[1]);
+ q[8] = vaddq_s16(q[4], q[1]);
+ q[9] = vaddq_s16(q[5], q[0]);
+ q[6] = vsubq_s16(q[5], q[0]);
+ q[7] = vsubq_s16(q[4], q[1]);
+ store_in_output(out, 18, 19, q[6], q[7]);
+ store_in_output(out, 12, 13, q[8], q[9]);
+
+ load_from_output(out, 28, 29, &q[0], &q[1]);
+ q[4] = vaddq_s16(q[2], q[1]);
+ q[5] = vaddq_s16(q[3], q[0]);
+ q[6] = vsubq_s16(q[3], q[0]);
+ q[7] = vsubq_s16(q[2], q[1]);
+ store_in_output(out, 28, 29, q[6], q[7]);
+ store_in_output(out, 2, 3, q[4], q[5]);
+
+ load_from_output(out, 10, 11, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[12], q[1]);
+ q[3] = vaddq_s16(q[13], q[0]);
+ q[4] = vsubq_s16(q[13], q[0]);
+ q[5] = vsubq_s16(q[12], q[1]);
+
+ load_from_output(out, 20, 21, &q[0], &q[1]);
+ q[8] = vaddq_s16(q[4], q[1]);
+ q[9] = vaddq_s16(q[5], q[0]);
+ q[6] = vsubq_s16(q[5], q[0]);
+ q[7] = vsubq_s16(q[4], q[1]);
+ store_in_output(out, 20, 21, q[6], q[7]);
+ store_in_output(out, 10, 11, q[8], q[9]);
+
+ load_from_output(out, 26, 27, &q[0], &q[1]);
+ q[4] = vaddq_s16(q[2], q[1]);
+ q[5] = vaddq_s16(q[3], q[0]);
+ q[6] = vsubq_s16(q[3], q[0]);
+ q[7] = vsubq_s16(q[2], q[1]);
+ store_in_output(out, 26, 27, q[6], q[7]);
+ store_in_output(out, 4, 5, q[4], q[5]);
+
+ load_from_output(out, 8, 9, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[14], q[1]);
+ q[3] = vaddq_s16(q[15], q[0]);
+ q[4] = vsubq_s16(q[15], q[0]);
+ q[5] = vsubq_s16(q[14], q[1]);
+
+ load_from_output(out, 22, 23, &q[0], &q[1]);
+ q[8] = vaddq_s16(q[4], q[1]);
+ q[9] = vaddq_s16(q[5], q[0]);
+ q[6] = vsubq_s16(q[5], q[0]);
+ q[7] = vsubq_s16(q[4], q[1]);
+ store_in_output(out, 22, 23, q[6], q[7]);
+ store_in_output(out, 8, 9, q[8], q[9]);
+
+ load_from_output(out, 24, 25, &q[0], &q[1]);
+ q[4] = vaddq_s16(q[2], q[1]);
+ q[5] = vaddq_s16(q[3], q[0]);
+ q[6] = vsubq_s16(q[3], q[0]);
+ q[7] = vsubq_s16(q[2], q[1]);
+ store_in_output(out, 24, 25, q[6], q[7]);
+ store_in_output(out, 6, 7, q[4], q[5]);
+}
+
+static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
+ uint8_t *const dest,
+ const int stride,
+ int16x8_t *const q) {
+ uint8_t *dest0 = dest + 0 * stride;
+ uint8_t *dest1 = dest + 31 * stride;
+ uint8_t *dest2 = dest + 16 * stride;
+ uint8_t *dest3 = dest + 15 * stride;
+ const int str2 = stride << 1;
+
+ store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 30, 31, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 12, 13, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[10], q[1]);
+ q[3] = vaddq_s16(q[11], q[0]);
+ q[4] = vsubq_s16(q[11], q[0]);
+ q[5] = vsubq_s16(q[10], q[1]);
+
+ load_from_output(out, 18, 19, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 28, 29, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 10, 11, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[12], q[1]);
+ q[3] = vaddq_s16(q[13], q[0]);
+ q[4] = vsubq_s16(q[13], q[0]);
+ q[5] = vsubq_s16(q[12], q[1]);
+
+ load_from_output(out, 20, 21, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 26, 27, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 8, 9, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[14], q[1]);
+ q[3] = vaddq_s16(q[15], q[0]);
+ q[4] = vsubq_s16(q[15], q[0]);
+ q[5] = vsubq_s16(q[14], q[1]);
+
+ load_from_output(out, 22, 23, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+
+ load_from_output(out, 24, 25, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+}
+
+static INLINE void highbd_idct32_bands_end_2nd_pass_bd8(
+ const int16_t *const out, uint16_t *const dest, const int stride,
+ int16x8_t *const q) {
+ uint16_t *dest0 = dest + 0 * stride;
+ uint16_t *dest1 = dest + 31 * stride;
+ uint16_t *dest2 = dest + 16 * stride;
+ uint16_t *dest3 = dest + 15 * stride;
+ const int str2 = stride << 1;
+
+ highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+ q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 30, 31, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+ q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 12, 13, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[10], q[1]);
+ q[3] = vaddq_s16(q[11], q[0]);
+ q[4] = vsubq_s16(q[11], q[0]);
+ q[5] = vsubq_s16(q[10], q[1]);
+
+ load_from_output(out, 18, 19, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+ q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 28, 29, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+ q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 10, 11, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[12], q[1]);
+ q[3] = vaddq_s16(q[13], q[0]);
+ q[4] = vsubq_s16(q[13], q[0]);
+ q[5] = vsubq_s16(q[12], q[1]);
+
+ load_from_output(out, 20, 21, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+ q[9]);
+ dest2 += str2;
+ dest3 -= str2;
+
+ load_from_output(out, 26, 27, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+ q[7]);
+ dest0 += str2;
+ dest1 -= str2;
+
+ load_from_output(out, 8, 9, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[14], q[1]);
+ q[3] = vaddq_s16(q[15], q[0]);
+ q[4] = vsubq_s16(q[15], q[0]);
+ q[5] = vsubq_s16(q[14], q[1]);
+
+ load_from_output(out, 22, 23, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+ highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+ q[9]);
+
+ load_from_output(out, 24, 25, &q[0], &q[1]);
+ q[4] = final_add(q[2], q[1]);
+ q[5] = final_add(q[3], q[0]);
+ q[6] = final_sub(q[3], q[0]);
+ q[7] = final_sub(q[2], q[1]);
+ highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+ q[7]);
+}
+
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+ const int stride, const int highbd_flag) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1
+ int16_t *out;
+ int16x8_t q[16];
+ uint16_t *dst = CAST_TO_SHORTPTR(dest);
+
+ for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+ idct32_pass_loop++, out = pass2) {
+ for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop
+ if (idct32_pass_loop == 0) {
+ idct32_transpose_pair_tran_low(input, trans_buf);
+ input += 32 * 8;
+ } else {
+ idct32_transpose_pair(input_pass2, trans_buf);
+ input_pass2 += 32 * 8;
+ }
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
+ // part of stage 2
+ q[4] = vaddq_s16(q[0], q[1]);
+ q[13] = vsubq_s16(q[0], q[1]);
+ q[6] = vaddq_s16(q[2], q[3]);
+ q[14] = vsubq_s16(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
+
+ // generate 18,19,28,29
+ // part of stage 1
+ load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = vsubq_s16(q[3], q[2]);
+ q[3] = vaddq_s16(q[3], q[2]);
+ q[14] = vsubq_s16(q[1], q[0]);
+ q[2] = vaddq_s16(q[1], q[0]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
+ // part of stage 4
+ q[8] = vaddq_s16(q[4], q[2]);
+ q[9] = vaddq_s16(q[5], q[0]);
+ q[10] = vaddq_s16(q[7], q[1]);
+ q[15] = vaddq_s16(q[6], q[3]);
+ q[13] = vsubq_s16(q[5], q[0]);
+ q[14] = vsubq_s16(q[7], q[1]);
+ store_in_output(out, 16, 31, q[8], q[15]);
+ store_in_output(out, 17, 30, q[9], q[10]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+ store_in_output(out, 29, 18, q[1], q[0]);
+ // part of stage 4
+ q[13] = vsubq_s16(q[4], q[2]);
+ q[14] = vsubq_s16(q[6], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+ store_in_output(out, 19, 28, q[4], q[6]);
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = vsubq_s16(q[0], q[1]);
+ q[0] = vaddq_s16(q[0], q[1]);
+ q[14] = vsubq_s16(q[2], q[3]);
+ q[2] = vaddq_s16(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+
+ // generate 22,23,24,25
+ // part of stage 1
+ load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
+ // part of stage 2
+ q[14] = vsubq_s16(q[4], q[5]);
+ q[5] = vaddq_s16(q[4], q[5]);
+ q[13] = vsubq_s16(q[6], q[7]);
+ q[6] = vaddq_s16(q[6], q[7]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
+ // part of stage 4
+ q[10] = vaddq_s16(q[7], q[1]);
+ q[11] = vaddq_s16(q[5], q[0]);
+ q[12] = vaddq_s16(q[6], q[2]);
+ q[15] = vaddq_s16(q[4], q[3]);
+ // part of stage 6
+ load_from_output(out, 16, 17, &q[14], &q[13]);
+ q[8] = vaddq_s16(q[14], q[11]);
+ q[9] = vaddq_s16(q[13], q[10]);
+ q[13] = vsubq_s16(q[13], q[10]);
+ q[11] = vsubq_s16(q[14], q[11]);
+ store_in_output(out, 17, 16, q[9], q[8]);
+ load_from_output(out, 30, 31, &q[14], &q[9]);
+ q[8] = vsubq_s16(q[9], q[12]);
+ q[10] = vaddq_s16(q[14], q[15]);
+ q[14] = vsubq_s16(q[14], q[15]);
+ q[12] = vaddq_s16(q[9], q[12]);
+ store_in_output(out, 30, 31, q[10], q[12]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 25, 22, q[14], q[13]);
+ do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 24, 23, q[14], q[13]);
+ // part of stage 4
+ q[14] = vsubq_s16(q[5], q[0]);
+ q[13] = vsubq_s16(q[6], q[2]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+ q[14] = vsubq_s16(q[7], q[1]);
+ q[13] = vsubq_s16(q[4], q[3]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
+ // part of stage 6
+ load_from_output(out, 18, 19, &q[14], &q[13]);
+ q[8] = vaddq_s16(q[14], q[1]);
+ q[9] = vaddq_s16(q[13], q[6]);
+ q[13] = vsubq_s16(q[13], q[6]);
+ q[1] = vsubq_s16(q[14], q[1]);
+ store_in_output(out, 18, 19, q[8], q[9]);
+ load_from_output(out, 28, 29, &q[8], &q[9]);
+ q[14] = vsubq_s16(q[8], q[5]);
+ q[10] = vaddq_s16(q[8], q[5]);
+ q[11] = vaddq_s16(q[9], q[0]);
+ q[0] = vsubq_s16(q[9], q[0]);
+ store_in_output(out, 28, 29, q[10], q[11]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 20, 27, q[13], q[14]);
+ do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+ store_in_output(out, 21, 26, q[1], q[0]);
+
+ // -----------------------------------------
+      // BLOCK C: 8-11,12-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
+ // part of stage 3
+ q[13] = vsubq_s16(q[0], q[1]);
+ q[0] = vaddq_s16(q[0], q[1]);
+ q[14] = vsubq_s16(q[2], q[3]);
+ q[2] = vaddq_s16(q[2], q[3]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
+
+ // generate 10,11,12,13
+ // part of stage 2
+ load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
+ // part of stage 3
+ q[14] = vsubq_s16(q[4], q[5]);
+ q[5] = vaddq_s16(q[4], q[5]);
+ q[13] = vsubq_s16(q[6], q[7]);
+ q[6] = vaddq_s16(q[6], q[7]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
+ // part of stage 5
+ q[8] = vaddq_s16(q[0], q[5]);
+ q[9] = vaddq_s16(q[1], q[7]);
+ q[13] = vsubq_s16(q[1], q[7]);
+ q[14] = vsubq_s16(q[3], q[4]);
+ q[10] = vaddq_s16(q[3], q[4]);
+ q[15] = vaddq_s16(q[2], q[6]);
+ store_in_output(out, 8, 15, q[8], q[15]);
+ store_in_output(out, 9, 14, q[9], q[10]);
+ // part of stage 6
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 13, 10, q[3], q[1]);
+ q[13] = vsubq_s16(q[0], q[5]);
+ q[14] = vsubq_s16(q[2], q[6]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 11, 12, q[1], q[3]);
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+ // part of stage 4
+ q[13] = vsubq_s16(q[0], q[1]);
+ q[0] = vaddq_s16(q[0], q[1]);
+ q[14] = vsubq_s16(q[2], q[3]);
+ q[2] = vaddq_s16(q[2], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+
+ // generate 0,1,2,3
+ // part of stage 4
+ load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
+ // part of stage 5
+ q[4] = vaddq_s16(q[7], q[6]);
+ q[7] = vsubq_s16(q[7], q[6]);
+ q[6] = vsubq_s16(q[5], q[14]);
+ q[5] = vaddq_s16(q[5], q[14]);
+ // part of stage 6
+ q[8] = vaddq_s16(q[4], q[2]);
+ q[9] = vaddq_s16(q[5], q[3]);
+ q[10] = vaddq_s16(q[6], q[1]);
+ q[11] = vaddq_s16(q[7], q[0]);
+ q[12] = vsubq_s16(q[7], q[0]);
+ q[13] = vsubq_s16(q[6], q[1]);
+ q[14] = vsubq_s16(q[5], q[3]);
+ q[15] = vsubq_s16(q[4], q[2]);
+ // part of stage 7
+ load_from_output(out, 14, 15, &q[0], &q[1]);
+ q[2] = vaddq_s16(q[8], q[1]);
+ q[3] = vaddq_s16(q[9], q[0]);
+ q[4] = vsubq_s16(q[9], q[0]);
+ q[5] = vsubq_s16(q[8], q[1]);
+ load_from_output(out, 16, 17, &q[0], &q[1]);
+ q[8] = final_add(q[4], q[1]);
+ q[9] = final_add(q[5], q[0]);
+ q[6] = final_sub(q[5], q[0]);
+ q[7] = final_sub(q[4], q[1]);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out, q);
+ } else {
+ if (highbd_flag) {
+ highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q);
+ dst += 8;
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride, q);
+ dest += 8;
+ }
+ }
+ }
+ }
+}
+
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ vpx_idct32_32_neon(input, dest, stride, 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
new file mode 100644
index 0000000000..d83421e9e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -0,0 +1,66 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_idct4x4_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int stride
+
+|vpx_idct4x4_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 4)
+ add r0, r0, #8 ; + (1 <<((4) - 1))
+ asr r0, r0, #4 ; >> 4
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q8, q0, d2 ; dest[x] + a1
+ vaddw.u8 q9, q0, d4
+
+ vqmovun.s16 d6, q8 ; clip_pixel
+ vqmovun.s16 d7, q9
+
+ vst1.32 {d6[0]}, [r12], r2
+ vst1.32 {d6[1]}, [r12], r2
+ vst1.32 {d7[0]}, [r12], r2
+ vst1.32 {d7[1]}, [r12]
+
+ bx lr
+ ENDP ; |vpx_idct4x4_1_add_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 0000000000..a14b895431
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
+ const int16x8_t res,
+ uint32x2_t *const d) {
+ uint16x8_t a;
+ uint8x8_t b;
+ *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
+ *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
+ a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
+ b = vqmovun_s16(vreinterpretq_s16_u16(a));
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
+ *dest += stride;
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
+ *dest += stride;
+}
+
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint32x2_t d = vdup_n_u32(0);
+
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+}
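
The assembly and C versions above compute the same scalar DC value before broadcasting it across the block. A sketch with a worked example (WRAPLOW's wrapping is omitted; cospi_16_64 == 11585, DCT_CONST_BITS == 14):

#include <stdint.h>

/* Two rounds of dct_const_round_shift by cospi_16_64, then
 * ROUND_POWER_OF_TWO(out, 4). */
static int16_t idct4x4_dc_value(int16_t dc_coeff) {
  const int32_t out0 = (dc_coeff * 11585 + (1 << 13)) >> 14;
  const int32_t out1 = (out0 * 11585 + (1 << 13)) >> 14;
  return (int16_t)((out1 + 8) >> 4);
}

/* Example: idct4x4_dc_value(100) == 3
 *   (100 * 11585 + 8192) >> 14 == 71
 *   ( 71 * 11585 + 8192) >> 14 == 50
 *   ( 50 + 8) >> 4             ==  3, so every pixel gets +3, clamped. */
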
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
new file mode 100644
index 0000000000..175ba7fbc2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -0,0 +1,188 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_idct4x4_16_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
+ AREA Block, CODE, READONLY
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int stride
+
+|vpx_idct4x4_16_add_neon| PROC
+
+  ; The 2D transform is done with two passes, which are actually pretty
+  ; similar. We first transform the rows: transpose the inputs, do a SIMD
+  ; column transform (the columns are the transposed rows), then transpose
+  ; the results back to their normal/row positions. Then we transform the
+  ; columns with another SIMD column transform.
+  ; So: two passes, each a transpose followed by a column transform.
+
+ ; load the inputs into q8-q9, d16-d19
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+
+ ; generate scalar constants
+ ; cospi_8_64 = 15137
+ movw r0, #0x3b21
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
+
+ ; transpose the input data
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ ; generate constant vectors
+ vdup.16 d20, r0 ; replicate cospi_8_64
+ vdup.16 d21, r3 ; replicate cospi_16_64
+
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ vdup.16 d22, r12 ; replicate cospi_24_64
+
+ ; do the transform on transposed rows
+
+ ; stage 1
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q8, d16, d21
+ vmull.s16 q14, d18, d21
+ vadd.s32 q13, q8, q14
+ vsub.s32 q14, q8, q14
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+
+ ; transpose the results
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ ; do the transform on columns
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+
+  ; The results are in two registers, one of them being swapped. This is
+  ; taken care of by loading the 'dest' values in a swapped fashion and
+  ; storing them back in the same swapped fashion.
+ ; temp_out[0, 1] = d16, d17 = q8
+ ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+ vld1.32 {d26[0]}, [r1], r2
+ vld1.32 {d26[1]}, [r1], r2
+ vld1.32 {d27[1]}, [r1], r2
+ vld1.32 {d27[0]}, [r1] ; no post-increment
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
+ vaddw.u8 q8, q8, d26
+ vaddw.u8 q9, q9, d27
+
+ ; clip_pixel
+ vqmovun.s16 d26, q8
+ vqmovun.s16 d27, q9
+
+ ; do the stores in reverse order with negative post-increment, by changing
+ ; the sign of the stride
+ rsb r2, r2, #0
+ vst1.32 {d27[0]}, [r1], r2
+ vst1.32 {d27[1]}, [r1], r2
+ vst1.32 {d26[1]}, [r1], r2
+ vst1.32 {d26[0]}, [r1] ; no post-increment
+ bx lr
+ ENDP ; |vpx_idct4x4_16_add_neon|
+
+ END
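
The stage comments in the listing above are the standard 4-point IDCT. In scalar form (a sketch; WRAPLOW omitted, cospi constants as defined in vpx_dsp/txfm_common.h):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_SHIFT(x) \
  ((int16_t)(((x) + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS))

/* One 4-point column transform: stage 1 builds the even/odd halves,
 * stage 2 recombines them with add/sub butterflies. */
static void idct4_scalar(const int16_t *in, int16_t *out) {
  const int16_t s0 = ROUND_SHIFT((in[0] + in[2]) * 11585); /* cospi_16_64 */
  const int16_t s1 = ROUND_SHIFT((in[0] - in[2]) * 11585);
  const int16_t s2 =
      ROUND_SHIFT(in[1] * 6270 - in[3] * 15137); /* cospi_24_64, cospi_8_64 */
  const int16_t s3 = ROUND_SHIFT(in[1] * 15137 + in[3] * 6270);
  out[0] = (int16_t)(s0 + s3);
  out[1] = (int16_t)(s1 + s2);
  out[2] = (int16_t)(s1 - s2);
  out[3] = (int16_t)(s0 - s3);
}
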
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 0000000000..8192ee4cf8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const uint8_t *dst = dest;
+ uint32x2_t s32 = vdup_n_u32(0);
+ int16x8_t a[2];
+ uint8x8_t s, d[2];
+ uint16x8_t sum[2];
+
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ // Rows
+ a[0] = load_tran_low_to_s16q(input);
+ a[1] = load_tran_low_to_s16q(input + 8);
+ transpose_idct4x4_16_bd8(a);
+
+ // Columns
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_idct4x4_16_bd8(a);
+ a[0] = vrshrq_n_s16(a[0], 4);
+ a[1] = vrshrq_n_s16(a[1], 4);
+
+ s = load_u8(dst, stride);
+ dst += 2 * stride;
+ // The elements are loaded in reverse order.
+ s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1);
+ dst += stride;
+ s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0);
+
+ sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s);
+ sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32));
+ d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+ d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
+
+ store_u8(dest, stride, d[0]);
+ dest += 2 * stride;
+ // The elements are stored in reverse order.
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1);
+ dest += stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0);
+}
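
Both the assembly and the NEON C version share the same two-pass shape, which the scalar reference vpx_idct4x4_16_add_c in vpx_dsp/inv_txfm.c makes explicit: rows, then columns, then a 4-bit rounding and a clamped add into the destination. A sketch using the illustrative idct4_scalar() from above:

#include <stdint.h>

/* Rows first, then columns, then ROUND_POWER_OF_TWO(., 4) and a clamped
 * accumulate into dest. */
static void idct4x4_16_add_scalar(const int16_t *input, uint8_t *dest,
                                  int stride) {
  int16_t out[4 * 4], col_in[4], col_out[4];
  int i, j;
  for (i = 0; i < 4; ++i) idct4_scalar(input + 4 * i, out + 4 * i);
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) col_in[j] = out[j * 4 + i];
    idct4_scalar(col_in, col_out);
    for (j = 0; j < 4; ++j) {
      const int v = ((col_out[j] + 8) >> 4) + dest[j * stride + i];
      dest[j * stride + i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}
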
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 0000000000..ce9b459589
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE uint8x8_t create_dcd(const int16_t dc) {
+ int16x8_t t = vdupq_n_s16(dc);
+ return vqmovun_s16(t);
+}
+
+static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqadd_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
+
+static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqsub_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
+
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
+
+ if (a1 >= 0) {
+ const uint8x8_t dc = create_dcd(a1);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x8_t dc = create_dcd(-a1);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ }
+}
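
The sign split above keeps the whole DC add in 8 bits: a non-negative DC is broadcast and applied with vqadd_u8, a negative one is negated and applied with vqsub_u8, and both directions saturate to [0, 255] with no widening to 16 bits. A one-pixel scalar model (a sketch):

#include <stdint.h>

/* dest[x] + a1 clamped to [0, 255], split by sign as in the kernels above;
 * the pre-saturation done by create_dcd()'s vqmovun_s16 folds into the
 * same clamp. */
static uint8_t add_dc_clamped(uint8_t pixel, int16_t a1) {
  const int sum = (a1 >= 0) ? pixel + a1      /* vqadd_u8 path */
                            : pixel - (-a1);  /* vqsub_u8 path */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}
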
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 0000000000..7471387e47
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t a[8];
+
+ a[0] = load_tran_low_to_s16q(input);
+ a[1] = load_tran_low_to_s16q(input + 8);
+ a[2] = load_tran_low_to_s16q(input + 16);
+ a[3] = load_tran_low_to_s16q(input + 24);
+ a[4] = load_tran_low_to_s16q(input + 32);
+ a[5] = load_tran_low_to_s16q(input + 40);
+ a[6] = load_tran_low_to_s16q(input + 48);
+ a[7] = load_tran_low_to_s16q(input + 56);
+
+ idct8x8_64_1d_bd8(cospis0, cospis1, a);
+ idct8x8_64_1d_bd8(cospis0, cospis1, a);
+ idct8x8_add8x8_neon(a, dest, stride);
+}
+
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24
+ const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28
+ int16x4_t a[8];
+ int16x8_t b[8];
+
+ a[0] = load_tran_low_to_s16d(input);
+ a[1] = load_tran_low_to_s16d(input + 8);
+ a[2] = load_tran_low_to_s16d(input + 16);
+ a[3] = load_tran_low_to_s16d(input + 24);
+
+ idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a);
+ idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b);
+ idct8x8_add8x8_neon(b, dest, stride);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm
new file mode 100644
index 0000000000..5dd9bdc788
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm
@@ -0,0 +1,46 @@
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ INCLUDE ./vpx_config.asm
+
+  ; Helper macros used to load tran_low_t into int16, narrowing if
+  ; necessary.
+
+ ; $dst0..3 are d registers with the pairs assumed to be contiguous in
+ ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld1.s32 {q0,q1}, [$src]!
+ vld1.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q1
+ vmovn.i32 $dst2, q2
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
+ ENDIF
+ MEND
+
+ ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld2.s32 {q0,q1}, [$src]!
+ vld2.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q2
+ vmovn.i32 $dst2, q1
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]!
+ ENDIF
+ MEND
+ END
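
The C files in this patch get the same narrowing from load_tran_low_to_s16q() in vpx_dsp/arm/mem_neon.h. The idea, sketched for the high-bitdepth layout where tran_low_t is 32 bits wide (in non-high-bitdepth builds tran_low_t is already int16_t and a plain vld1q_s16() suffices):

#include <arm_neon.h>
#include <stdint.h>

/* Load eight 32-bit coefficients and narrow them into one int16x8_t. */
static inline int16x8_t load_tran_low_to_s16q_sketch(const int32_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  return vcombine_s16(vmovn_s32(v0), vmovn_s32(v1));
}
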
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 0000000000..c02311326b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static const int16_t kCospi[16] = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
+};
+
+static const int32_t kCospi32[16] = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
+};
+
+//------------------------------------------------------------------------------
+// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
+static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vqaddq_s16(a, b);
+#else
+ return vaddq_s16(a, b);
+#endif
+}
+
+static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vqsubq_s16(a, b);
+#else
+ return vsubq_s16(a, b);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
+ const int32x4x2_t s1) {
+ int32x4x2_t t;
+ t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
+ t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
+ return t;
+}
+
+static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
+ const int32x4x2_t s1) {
+ int32x4x2_t t;
+ t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
+ t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
+ return t;
+}
+
+//------------------------------------------------------------------------------
+
+static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) {
+ return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS),
+ vrshrn_n_s32(in[1], DCT_CONST_BITS));
+}
+
+static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ *d0 = dct_const_round_shift_low_8(t32 + 0);
+ *d1 = dct_const_round_shift_low_8(t32 + 2);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2(const int64x2_t *const in) {
+ int32x4x2_t out;
+ out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS),
+ vrshrn_n_s64(in[1], DCT_CONST_BITS));
+ out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS),
+ vrshrn_n_s64(in[3], DCT_CONST_BITS));
+ return out;
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+ const int16_t a_const) {
+ // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+ // streams. See WRAPLOW and dct_const_round_shift for details.
+ // This instruction doubles the result and returns the high half, essentially
+ // resulting in a right shift by 15. By multiplying the constant first that
+ // becomes a right shift by DCT_CONST_BITS.
+ // The largest possible value used here is
+ // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+ // within the range of int16_t (+32767 / -32768) even when negated.
+ return vqrdmulhq_n_s16(a, a_const * 2);
+}
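+
+// Worked through as a sketch: vqrdmulhq_n_s16 computes
+//   saturate16((2 * a * b + (1 << 15)) >> 16).
+// With b = a_const * 2 this becomes
+//   saturate16((a * a_const + (1 << 13)) >> 14),
+// which matches dct_const_round_shift(a * a_const) for DCT_CONST_BITS == 14,
+// plus saturation.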
+
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ // In both add_ and its pair, sub_, the input for well-formed streams will
+ // be well within 16 bits (input to the idct is the difference between two
+ // frames and will be within -255 to 255, or 9 bits).
+ // However, for inputs over about 25,000 (valid for int16_t, but not for
+ // idct input) this function cannot use vaddq_s16.
+ // In order to match existing behavior and intentionally out of range tests,
+ // expand the addition up to 32 bits to prevent truncation.
+ int32x4_t t[2];
+ t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ t[0] = vmulq_n_s32(t[0], ab_const);
+ t[1] = vmulq_n_s32(t[1], ab_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ int32x4_t t[2];
+ t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+ t[0] = vmulq_n_s32(t[0], ab_const);
+ t[1] = vmulq_n_s32(t[1], ab_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
+ const int16x8_t a, const int16_t a_const, const int16x8_t b,
+ const int16_t b_const) {
+ int32x4_t t[2];
+ t[0] = vmull_n_s16(vget_low_s16(a), a_const);
+ t[1] = vmull_n_s16(vget_high_s16(a), a_const);
+ t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const);
+ t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+//------------------------------------------------------------------------------
+
+// Note: The following 4 functions could use 32-bit operations for bit-depth 10.
+// However, although it's 20% faster with gcc, it's 20% slower with clang.
+// Use 64-bit operations for now.
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t
+multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
+ int64x2_t b[4];
+
+ b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+ b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+ b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+ b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+ return dct_const_round_shift_high_4x2(b);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+ int32x4_t t[2];
+ int64x2_t c[4];
+
+ t[0] = vaddq_s32(a.val[0], b.val[0]);
+ t[1] = vaddq_s32(a.val[1], b.val[1]);
+ c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+ c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+ c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+ c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+ int32x4_t t[2];
+ int64x2_t c[4];
+
+ t[0] = vsubq_s32(a.val[0], b.val[0]);
+ t[1] = vsubq_s32(a.val[1], b.val[1]);
+ c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+ c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+ c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+ c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
+ const int32_t b_const) {
+ int64x2_t c[4];
+ c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+ c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+ c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+ c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+ c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
+ c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
+ c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
+ c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Shift the output down by 6 and add it to the destination buffer.
+static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d,
+ const int stride) {
+ uint8x8_t b[8];
+ int16x8_t c[8];
+
+ b[0] = vld1_u8(d);
+ d += stride;
+ b[1] = vld1_u8(d);
+ d += stride;
+ b[2] = vld1_u8(d);
+ d += stride;
+ b[3] = vld1_u8(d);
+ d += stride;
+ b[4] = vld1_u8(d);
+ d += stride;
+ b[5] = vld1_u8(d);
+ d += stride;
+ b[6] = vld1_u8(d);
+ d += stride;
+ b[7] = vld1_u8(d);
+ d -= (7 * stride);
+
+ // c = b + ((a + 32) >> 6), i.e. a rounding shift of a by 6
+ c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6);
+ c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6);
+ c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6);
+ c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6);
+ c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6);
+ c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6);
+ c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6);
+ c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6);
+
+ b[0] = vqmovun_s16(c[0]);
+ b[1] = vqmovun_s16(c[1]);
+ b[2] = vqmovun_s16(c[2]);
+ b[3] = vqmovun_s16(c[3]);
+ b[4] = vqmovun_s16(c[4]);
+ b[5] = vqmovun_s16(c[5]);
+ b[6] = vqmovun_s16(c[6]);
+ b[7] = vqmovun_s16(c[7]);
+
+ vst1_u8(d, b[0]);
+ d += stride;
+ vst1_u8(d, b[1]);
+ d += stride;
+ vst1_u8(d, b[2]);
+ d += stride;
+ vst1_u8(d, b[3]);
+ d += stride;
+ vst1_u8(d, b[4]);
+ d += stride;
+ vst1_u8(d, b[5]);
+ d += stride;
+ vst1_u8(d, b[6]);
+ d += stride;
+ vst1_u8(d, b[7]);
+}
+
+static INLINE uint8x16_t create_dcq(const int16_t dc) {
+ // Clip to [0, 255] on both sides; gcc may compile this to a single 'usat'.
+ const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
+ return vdupq_n_u8((uint8_t)t);
+}
+
+static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) {
+ const int16x4_t cospis = vld1_s16(kCospi);
+ int16x4_t b[4];
+ int32x4_t c[4];
+ int16x8_t d[2];
+
+ b[0] = vget_low_s16(a[0]);
+ b[1] = vget_high_s16(a[0]);
+ b[2] = vget_low_s16(a[1]);
+ b[3] = vget_high_s16(a[1]);
+ c[0] = vmull_lane_s16(b[0], cospis, 2);
+ c[2] = vmull_lane_s16(b[1], cospis, 2);
+ c[1] = vsubq_s32(c[0], c[2]);
+ c[0] = vaddq_s32(c[0], c[2]);
+ c[3] = vmull_lane_s16(b[2], cospis, 3);
+ c[2] = vmull_lane_s16(b[2], cospis, 1);
+ c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1);
+ c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3);
+ dct_const_round_shift_low_8_dual(c, &d[0], &d[1]);
+ a[0] = vaddq_s16(d[0], d[1]);
+ a[1] = vsubq_s16(d[0], d[1]);
+}
+
+static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) {
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+}
+
+static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0,
+ const int16x4_t cospisd0,
+ const int16x4_t cospisd1,
+ int16x4_t *const io) {
+ int16x4_t step1[8], step2[8];
+ int32x4_t t32[2];
+
+ transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]);
+
+ // stage 1
+ step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3);
+ step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2);
+ step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1);
+ step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2);
+ step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3);
+ step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1);
+
+ step2[4] = vadd_s16(step1[4], step1[5]);
+ step2[5] = vsub_s16(step1[4], step1[5]);
+ step2[6] = vsub_s16(step1[7], step1[6]);
+ step2[7] = vadd_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vadd_s16(step2[1], step2[3]);
+ step1[1] = vadd_s16(step2[1], step2[2]);
+ step1[2] = vsub_s16(step2[1], step2[2]);
+ step1[3] = vsub_s16(step2[1], step2[3]);
+
+ t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
+ t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
+ step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+
+ // stage 4
+ io[0] = vadd_s16(step1[0], step2[7]);
+ io[1] = vadd_s16(step1[1], step1[6]);
+ io[2] = vadd_s16(step1[2], step1[5]);
+ io[3] = vadd_s16(step1[3], step2[4]);
+ io[4] = vsub_s16(step1[3], step2[4]);
+ io[5] = vsub_s16(step1[2], step1[5]);
+ io[6] = vsub_s16(step1[1], step1[6]);
+ io[7] = vsub_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0,
+ const int16x4_t cospisd0,
+ const int16x4_t cospisd1,
+ const int16x4_t *const input,
+ int16x8_t *const output) {
+ int16x8_t in[4];
+ int16x8_t step1[8], step2[8];
+ int32x4_t t32[8];
+
+ transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5],
+ input[6], input[7], &in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
+ step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
+ step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
+ step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
+ step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
+ step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
+
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s16(step2[1], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[1], step2[3]);
+
+ t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);
+
+ // stage 4
+ output[0] = vaddq_s16(step1[0], step2[7]);
+ output[1] = vaddq_s16(step1[1], step1[6]);
+ output[2] = vaddq_s16(step1[2], step1[5]);
+ output[3] = vaddq_s16(step1[3], step2[4]);
+ output[4] = vsubq_s16(step1[3], step2[4]);
+ output[5] = vsubq_s16(step1[2], step1[5]);
+ output[6] = vsubq_s16(step1[1], step1[6]);
+ output[7] = vsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0,
+ const int16x4_t cospis1,
+ int16x8_t *const io) {
+ int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
+ input7h;
+ int16x4_t step1l[4], step1h[4];
+ int16x8_t step1[8], step2[8];
+ int32x4_t t32[8];
+
+ // stage 1
+ input1l = vget_low_s16(io[1]);
+ input1h = vget_high_s16(io[1]);
+ input3l = vget_low_s16(io[3]);
+ input3h = vget_high_s16(io[3]);
+ input5l = vget_low_s16(io[5]);
+ input5h = vget_high_s16(io[5]);
+ input7l = vget_low_s16(io[7]);
+ input7h = vget_high_s16(io[7]);
+ step1l[0] = vget_low_s16(io[0]);
+ step1h[0] = vget_high_s16(io[0]);
+ step1l[1] = vget_low_s16(io[2]);
+ step1h[1] = vget_high_s16(io[2]);
+ step1l[2] = vget_low_s16(io[4]);
+ step1h[2] = vget_high_s16(io[4]);
+ step1l[3] = vget_low_s16(io[6]);
+ step1h[3] = vget_high_s16(io[6]);
+
+ t32[0] = vmull_lane_s16(input1l, cospis1, 3);
+ t32[1] = vmull_lane_s16(input1h, cospis1, 3);
+ t32[2] = vmull_lane_s16(input3l, cospis1, 2);
+ t32[3] = vmull_lane_s16(input3h, cospis1, 2);
+ t32[4] = vmull_lane_s16(input3l, cospis1, 1);
+ t32[5] = vmull_lane_s16(input3h, cospis1, 1);
+ t32[6] = vmull_lane_s16(input1l, cospis1, 0);
+ t32[7] = vmull_lane_s16(input1h, cospis1, 0);
+ t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0);
+ t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1);
+ t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1);
+ t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2);
+ t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2);
+ t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3);
+ t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3);
+ dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]);
+ dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]);
+
+ // stage 2
+ t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
+ t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
+ t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
+ t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
+ t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
+ t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
+ t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
+ t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
+ t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
+ t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
+ t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
+ t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
+ t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
+ t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
+ dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]);
+ dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]);
+
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+
+ t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);
+
+ // stage 4
+ io[0] = vaddq_s16(step1[0], step2[7]);
+ io[1] = vaddq_s16(step1[1], step1[6]);
+ io[2] = vaddq_s16(step1[2], step1[5]);
+ io[3] = vaddq_s16(step1[3], step2[4]);
+ io[4] = vsubq_s16(step1[3], step2[4]);
+ io[5] = vsubq_s16(step1[2], step1[5]);
+ io[6] = vsubq_s16(step1[1], step1[6]);
+ io[7] = vsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
+ const int16x4_t cospis1,
+ int16x8_t *const io) {
+ transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6],
+ &io[7]);
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io);
+}
+
+static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
+ const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int32x4_t *const t32) {
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+ t32[2] = vnegq_s32(t32[2]);
+ t32[3] = vnegq_s32(t32[3]);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
+ t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
+ t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+ t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+ t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+ t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26N_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26N_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
+ dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
+ int16x8_t *const out) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // Use saturating add/sub to avoid overflow in 2nd pass
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+#else
+ out[0] = vaddq_s16(step2[0], step2[15]);
+ out[1] = vaddq_s16(step2[1], step2[14]);
+ out[2] = vaddq_s16(step2[2], step2[13]);
+ out[3] = vaddq_s16(step2[3], step2[12]);
+ out[4] = vaddq_s16(step2[4], step2[11]);
+ out[5] = vaddq_s16(step2[5], step2[10]);
+ out[6] = vaddq_s16(step2[6], step2[9]);
+ out[7] = vaddq_s16(step2[7], step2[8]);
+ out[8] = vsubq_s16(step2[7], step2[8]);
+ out[9] = vsubq_s16(step2[6], step2[9]);
+ out[10] = vsubq_s16(step2[5], step2[10]);
+ out[11] = vsubq_s16(step2[4], step2[11]);
+ out[12] = vsubq_s16(step2[3], step2[12]);
+ out[13] = vsubq_s16(step2[2], step2[13]);
+ out[14] = vsubq_s16(step2[1], step2[14]);
+ out[15] = vsubq_s16(step2[0], step2[15]);
+#endif
+}
+
+static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
+ int16_t *output) {
+ // Save the result into output
+ vst1q_s16(output, out[0]);
+ output += 16;
+ vst1q_s16(output, out[1]);
+ output += 16;
+ vst1q_s16(output, out[2]);
+ output += 16;
+ vst1q_s16(output, out[3]);
+ output += 16;
+ vst1q_s16(output, out[4]);
+ output += 16;
+ vst1q_s16(output, out[5]);
+ output += 16;
+ vst1q_s16(output, out[6]);
+ output += 16;
+ vst1q_s16(output, out[7]);
+ output += 16;
+ vst1q_s16(output, out[8]);
+ output += 16;
+ vst1q_s16(output, out[9]);
+ output += 16;
+ vst1q_s16(output, out[10]);
+ output += 16;
+ vst1q_s16(output, out[11]);
+ output += 16;
+ vst1q_s16(output, out[12]);
+ output += 16;
+ vst1q_s16(output, out[13]);
+ output += 16;
+ vst1q_s16(output, out[14]);
+ output += 16;
+ vst1q_s16(output, out[15]);
+}
+
+static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest,
+ const int stride) {
+ const uint8x8_t s = vld1_u8(*dest);
+ const int16x8_t res = vrshrq_n_s16(a, 5);
+ const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
+ vst1_u8(*dest, d);
+ *dest += stride;
+}
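+
+// Scalar model of the sequence above (an illustrative sketch using the usual
+// vpx_dsp helpers clip_pixel and ROUND_POWER_OF_TWO):
+//   for (i = 0; i < 8; i++)
+//     (*dest)[i] = clip_pixel((*dest)[i] + ROUND_POWER_OF_TWO(a[i], 5));
+// The 8x8 inverse transform output is descaled by 1 << 5 before the add.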
+
+static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest,
+ const int stride) {
+ idct8x8_add8x1(out[0], &dest, stride);
+ idct8x8_add8x1(out[1], &dest, stride);
+ idct8x8_add8x1(out[2], &dest, stride);
+ idct8x8_add8x1(out[3], &dest, stride);
+ idct8x8_add8x1(out[4], &dest, stride);
+ idct8x8_add8x1(out[5], &dest, stride);
+ idct8x8_add8x1(out[6], &dest, stride);
+ idct8x8_add8x1(out[7], &dest, stride);
+}
+
+static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest,
+ const int stride) {
+ const uint8x8_t s = vld1_u8(*dest);
+ const int16x8_t res = vrshrq_n_s16(a, 6);
+ const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
+ vst1_u8(*dest, d);
+ *dest += stride;
+}
+
+static INLINE void idct16x16_add_store(const int16x8_t *const out,
+ uint8_t *dest, const int stride) {
+ // Add the result to dest
+ idct16x16_add8x1(out[0], &dest, stride);
+ idct16x16_add8x1(out[1], &dest, stride);
+ idct16x16_add8x1(out[2], &dest, stride);
+ idct16x16_add8x1(out[3], &dest, stride);
+ idct16x16_add8x1(out[4], &dest, stride);
+ idct16x16_add8x1(out[5], &dest, stride);
+ idct16x16_add8x1(out[6], &dest, stride);
+ idct16x16_add8x1(out[7], &dest, stride);
+ idct16x16_add8x1(out[8], &dest, stride);
+ idct16x16_add8x1(out[9], &dest, stride);
+ idct16x16_add8x1(out[10], &dest, stride);
+ idct16x16_add8x1(out[11], &dest, stride);
+ idct16x16_add8x1(out[12], &dest, stride);
+ idct16x16_add8x1(out[13], &dest, stride);
+ idct16x16_add8x1(out[14], &dest, stride);
+ idct16x16_add8x1(out[15], &dest, stride);
+}
+
+static INLINE void highbd_idct16x16_add8x1(const int16x8_t a,
+ const int16x8_t max,
+ uint16_t **const dest,
+ const int stride) {
+ const uint16x8_t s = vld1q_u16(*dest);
+ const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s));
+ const int16x8_t res1 = vminq_s16(res0, max);
+ const uint16x8_t d = vqshluq_n_s16(res1, 0);
+ vst1q_u16(*dest, d);
+ *dest += stride;
+}
+
+static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
+ const int stride) {
+ // Add the result to dest
+ const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
+ out[0] = vrshrq_n_s16(out[0], 6);
+ out[1] = vrshrq_n_s16(out[1], 6);
+ out[2] = vrshrq_n_s16(out[2], 6);
+ out[3] = vrshrq_n_s16(out[3], 6);
+ out[4] = vrshrq_n_s16(out[4], 6);
+ out[5] = vrshrq_n_s16(out[5], 6);
+ out[6] = vrshrq_n_s16(out[6], 6);
+ out[7] = vrshrq_n_s16(out[7], 6);
+ out[8] = vrshrq_n_s16(out[8], 6);
+ out[9] = vrshrq_n_s16(out[9], 6);
+ out[10] = vrshrq_n_s16(out[10], 6);
+ out[11] = vrshrq_n_s16(out[11], 6);
+ out[12] = vrshrq_n_s16(out[12], 6);
+ out[13] = vrshrq_n_s16(out[13], 6);
+ out[14] = vrshrq_n_s16(out[14], 6);
+ out[15] = vrshrq_n_s16(out[15], 6);
+ highbd_idct16x16_add8x1(out[0], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[1], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[2], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[3], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[4], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[5], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[6], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[7], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[8], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[9], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[10], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[11], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[12], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[13], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[14], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[15], max, &dest, stride);
+}
+
+static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a,
+ uint16_t **const dest,
+ const int stride) {
+ const uint16x8_t s = vld1q_u16(*dest);
+ const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6);
+ const uint16x8_t d = vmovl_u8(vqmovun_s16(res));
+ vst1q_u16(*dest, d);
+ *dest += stride;
+}
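+
+// Sketch of the store above: vrsraq_n_s16 adds ROUND_POWER_OF_TWO(a, 6) to
+// the 16-bit pixel and the vqmovun/vmovl pair clamps the sum to [0, 255],
+// which is sufficient because the bd8 path keeps 8-bit content in uint16_t
+// lanes.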
+
+static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
+ uint16_t *out, const int stride) {
+ highbd_idct16x16_add8x1_bd8(a[0], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[1], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[2], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[3], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[4], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[5], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[6], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[7], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[8], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[9], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[10], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[11], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[12], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[13], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[14], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[15], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[16], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[17], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[18], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[19], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[20], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[21], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[22], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[23], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[24], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[25], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[26], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[27], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[28], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[29], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[30], &out, stride);
+ highbd_idct16x16_add8x1_bd8(a[31], &out, stride);
+}
+
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int16_t *output);
+
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+ int16_t *const output, void *const dest,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+ const int highbd_flag);
+
+#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..4f909e4935
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,1942 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
+ return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref));
+}
+
+static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
+ }
+}
+
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t a = load_unaligned_u8_4x1(above);
+ const uint8x8_t l = load_unaligned_u8_4x1(left);
+ const uint16x4_t al = vget_low_u16(vaddl_u8(a, l));
+ const uint16_t sum = horizontal_add_uint16x4(al);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
+ dc_store_4x4(dst, stride, dc);
+}
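+
+// Scalar model of the DC rule above (illustrative sketch):
+//   sum = above[0] + ... + above[3] + left[0] + ... + left[3];
+//   dc = ROUND_POWER_OF_TWO(sum, 3);  // (sum + 4) >> 3
+// The single dc value is then broadcast to all 16 pixels of the block.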
+
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16_t sum = dc_sum_4(left);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
+ (void)above;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16_t sum = dc_sum_4(above);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
+ (void)left;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
+ return horizontal_add_uint8x8(vld1_u8(ref));
+}
+
+static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1_u8(dst, dc);
+ }
+}
+
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint16x8_t al = vaddl_u8(above_u8, left_u8);
+ const uint16_t sum = horizontal_add_uint16x8(al);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4);
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16_t sum = dc_sum_8(left);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
+ (void)above;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16_t sum = dc_sum_8(above);
+ const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
+ (void)left;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
+ return horizontal_add_uint8x16(vld1q_u8(ref));
+}
+
+static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8x16_t dc) {
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst1q_u8(dst + 0, dc);
+ }
+}
+
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t ref0 = vld1q_u8(above);
+ const uint8x16_t ref1 = vld1q_u8(left);
+ const uint16x8_t a = vpaddlq_u8(ref0);
+ const uint16x8_t l = vpaddlq_u8(ref1);
+ const uint16x8_t al = vaddq_u16(a, l);
+ const uint16_t sum = horizontal_add_uint16x8(al);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16_t sum = dc_sum_16(left);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
+ (void)above;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16_t sum = dc_sum_16(above);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
+ (void)left;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
+ const uint8x16_t r0 = vld1q_u8(ref + 0);
+ const uint8x16_t r1 = vld1q_u8(ref + 16);
+ const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
+ return horizontal_add_uint16x8(r01);
+}
+
+static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8x16_t dc) {
+ int i;
+ for (i = 0; i < 32; ++i, dst += stride) {
+ vst1q_u8(dst + 0, dc);
+ vst1q_u8(dst + 16, dc);
+ }
+}
+
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vld1q_u8(above + 0);
+ const uint8x16_t a1 = vld1q_u8(above + 16);
+ const uint8x16_t l0 = vld1q_u8(left + 0);
+ const uint8x16_t l1 = vld1q_u8(left + 16);
+ const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1));
+ const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1));
+ const uint16x8_t al = vaddq_u16(a01, l01);
+ const uint16_t sum = horizontal_add_uint16x8(al);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0);
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16_t sum = dc_sum_32(left);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
+ (void)above;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16_t sum = dc_sum_32(above);
+ const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
+ (void)left;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a0, a1, a2, d0;
+ uint8_t a7;
+ (void)left;
+
+ a0 = vld1_u8(above);
+ a7 = above[7];
+
+ // [ above[1], ..., above[6], x, x ]
+ a1 = vext_u8(a0, a0, 1);
+ // [ above[2], ..., above[7], x, x ]
+ a2 = vext_u8(a0, a0, 2);
+
+ // d0[0] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[5] = AVG3(above[5], above[6], above[7]);
+ // d0[6] = x (don't care)
+ // d0[7] = x (don't care)
+ d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+ // We want:
+ // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
+ // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
+ // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
+ // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
+ store_u8_4x1(dst + 0 * stride, d0);
+ store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
+ store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
+ store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));
+
+ // The last store wrote the unused d0[6] lane; fix it up to above[7].
+ dst[3 * stride + 3] = a7;
+}
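+
+// The AVG3 construction used above is exact; worked through as a sketch:
+//   vrhadd_u8(vhadd_u8(a, c), b) = ((((a + c) >> 1) + b + 1) >> 1)
+//                                = (a + 2 * b + c + 2) >> 2
+// for all 8-bit inputs, since the bit truncated when (a + c) is odd can
+// never change the result of the outer rounding shift.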
+
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t ax0, a0, a1, a7, d0;
+ (void)left;
+
+ a0 = vld1_u8(above + 0);
+ a1 = vld1_u8(above + 1);
+ a7 = vld1_dup_u8(above + 7);
+
+ // We want the AVG3 result in lanes 1-7 inclusive so that above[7] can be
+ // shifted in later; shift a0 across by one to get the right inputs:
+ // [ x, above[0], ... , above[6] ]
+ ax0 = vext_u8(a0, a0, 7);
+
+ // d0[0] = x (don't care)
+ // d0[1] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[7] = AVG3(above[6], above[7], above[8]);
+ d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);
+
+ // Undo the earlier ext, incrementally shift in duplicates of above[7].
+ vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
+ vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
+ vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
+ vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
+ vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
+ vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
+ vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
+ vst1_u8(dst + 7 * stride, a7);
+}
+
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t ax0, a0, a1, a15, d0;
+ (void)left;
+
+ a0 = vld1q_u8(above + 0);
+ a1 = vld1q_u8(above + 1);
+ a15 = vld1q_dup_u8(above + 15);
+
+ // We want the AVG3 result in lanes 1-15 inclusive so that above[15] can be
+ // shifted in later; shift a0 across by one to get the right inputs:
+ // [ x, above[0], ... , above[14] ]
+ ax0 = vextq_u8(a0, a0, 15);
+
+ // d0[0] = x (don't care)
+ // d0[1] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[15] = AVG3(above[14], above[15], above[16]);
+ d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+
+ // Undo the earlier ext, incrementally shift in duplicates of above[15].
+ vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
+ vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
+ vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
+ vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
+ vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
+ vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
+ vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
+ vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
+ vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
+ vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
+ vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
+ vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
+ vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
+ vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
+ vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
+ vst1q_u8(dst + 15 * stride, a15);
+}
+
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
+ (void)left;
+
+ a0 = vld1q_u8(above + 0);
+ a1 = vld1q_u8(above + 1);
+ a15 = vld1q_u8(above + 15);
+ a16 = vld1q_u8(above + 16);
+ a17 = vld1q_u8(above + 17);
+ a31 = vld1q_dup_u8(above + 31);
+
+ // We want the AVG3 result in lanes 1-15 inclusive so that the following
+ // values can be shifted in later; shift a0 across by one to get the right
+ // inputs:
+ // [ x, above[0], ... , above[14] ]
+ ax0 = vextq_u8(a0, a0, 15);
+
+ // d0[0] = x (don't care)
+ // d0[1] = AVG3(above[0], above[1], above[2]);
+ // ...
+ // d0[15] = AVG3(above[14], above[15], above[16]);
+ d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+ d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);
+
+ // Undo the earlier ext, incrementally shifting in lanes of d0[1] and then
+ // duplicates of above[31].
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
+ vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
+ vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
+ vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
+ vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
+ vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
+ vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
+ vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
+ vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
+ vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
+ vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
+ vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
+ vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
+ vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
+ vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
+ vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
+ vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
+ vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
+ vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
+ vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
+ vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
+ vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
+ vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
+ vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
+ vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
+ vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
+ vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
+ vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
+ vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
+ vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
+ vst1q_u8(dst + 15 * stride + 0, d0[1]);
+ vst1q_u8(dst + 15 * stride + 16, a31);
+
+ vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
+ vst1q_u8(dst + 16 * stride + 16, a31);
+ vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
+ vst1q_u8(dst + 17 * stride + 16, a31);
+ vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
+ vst1q_u8(dst + 18 * stride + 16, a31);
+ vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
+ vst1q_u8(dst + 19 * stride + 16, a31);
+ vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
+ vst1q_u8(dst + 20 * stride + 16, a31);
+ vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
+ vst1q_u8(dst + 21 * stride + 16, a31);
+ vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
+ vst1q_u8(dst + 22 * stride + 16, a31);
+ vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
+ vst1q_u8(dst + 23 * stride + 16, a31);
+ vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
+ vst1q_u8(dst + 24 * stride + 16, a31);
+ vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
+ vst1q_u8(dst + 25 * stride + 16, a31);
+ vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
+ vst1q_u8(dst + 26 * stride + 16, a31);
+ vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
+ vst1q_u8(dst + 27 * stride + 16, a31);
+ vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
+ vst1q_u8(dst + 28 * stride + 16, a31);
+ vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
+ vst1q_u8(dst + 29 * stride + 16, a31);
+ vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
+ vst1q_u8(dst + 30 * stride + 16, a31);
+ vst1q_u8(dst + 31 * stride + 0, a31);
+ vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3;
+ (void)left;
+
+ a0 = load_unaligned_u8_4x1(above + 0);
+ a1 = load_unaligned_u8_4x1(above + 1);
+ a2 = load_unaligned_u8_4x1(above + 2);
+ a3 = load_unaligned_u8_4x1(above + 3);
+
+ d0 = vrhadd_u8(a0, a1);
+ d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+ d2 = vrhadd_u8(a1, a2);
+ d3 = vrhadd_u8(vhadd_u8(a1, a3), a2);
+
+ store_u8_4x1(dst + 0 * stride, d0);
+ store_u8_4x1(dst + 1 * stride, d1);
+ store_u8_4x1(dst + 2 * stride, d2);
+ store_u8_4x1(dst + 3 * stride, d3);
+}
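+
+// Sketch of the pattern above: even rows use AVG2(x, y) = vrhadd_u8(x, y)
+// = (x + y + 1) >> 1 and odd rows use the AVG3 construction
+// vrhadd_u8(vhadd_u8(x, z), y), stepping one pixel further along `above`
+// every two rows.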
+
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a0, a1, a2, a7, d0, d1;
+ (void)left;
+
+ a0 = vld1_u8(above + 0);
+ a1 = vld1_u8(above + 1);
+ a2 = vld1_u8(above + 2);
+ a7 = vld1_dup_u8(above + 7);
+
+ d0 = vrhadd_u8(a0, a1);
+ d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+ vst1_u8(dst + 0 * stride, d0);
+ vst1_u8(dst + 1 * stride, d1);
+
+ d0 = vext_u8(d0, d0, 7);
+ d1 = vext_u8(d1, d1, 7);
+
+ vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2));
+ vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2));
+ vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3));
+ vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3));
+ vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4));
+ vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4));
+}
+
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a0, a1, a2, a15, d0, d1;
+ (void)left;
+
+ a0 = vld1q_u8(above + 0);
+ a1 = vld1q_u8(above + 1);
+ a2 = vld1q_u8(above + 2);
+ a15 = vld1q_dup_u8(above + 15);
+
+ d0 = vrhaddq_u8(a0, a1);
+ d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+
+ vst1q_u8(dst + 0 * stride, d0);
+ vst1q_u8(dst + 1 * stride, d1);
+
+ d0 = vextq_u8(d0, d0, 15);
+ d1 = vextq_u8(d1, d1, 15);
+
+ vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
+ vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
+ vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
+ vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
+ vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
+ vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
+ vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
+ vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
+ vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
+ vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
+ vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
+ vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
+ vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
+ vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
+}
+
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
+ (void)left;
+
+ a0 = vld1q_u8(above + 0);
+ a1 = vld1q_u8(above + 1);
+ a2 = vld1q_u8(above + 2);
+ a16 = vld1q_u8(above + 16);
+ a17 = vld1q_u8(above + 17);
+ a18 = vld1q_u8(above + 18);
+ a31 = vld1q_dup_u8(above + 31);
+
+ d0_lo = vrhaddq_u8(a0, a1);
+ d0_hi = vrhaddq_u8(a16, a17);
+ d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+ d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);
+
+ vst1q_u8(dst + 0 * stride + 0, d0_lo);
+ vst1q_u8(dst + 0 * stride + 16, d0_hi);
+ vst1q_u8(dst + 1 * stride + 0, d1_lo);
+ vst1q_u8(dst + 1 * stride + 16, d1_hi);
+
+ d0_hi = vextq_u8(d0_lo, d0_hi, 15);
+ d0_lo = vextq_u8(d0_lo, d0_lo, 15);
+ d1_hi = vextq_u8(d1_lo, d1_hi, 15);
+ d1_lo = vextq_u8(d1_lo, d1_lo, 15);
+
+ vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
+ vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
+ vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
+ vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
+ vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
+ vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
+ vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
+ vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
+ vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
+ vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
+ vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
+ vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
+ vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
+ vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
+ vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
+ vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
+ vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
+ vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
+ vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
+ vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
+ vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
+ vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
+ vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
+ vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
+ vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
+ vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
+ vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
+ vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
+ vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
+ vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
+ vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
+ vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
+ vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
+ vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
+ vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
+ vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
+ vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
+ vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
+ vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
+ vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
+ vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
+ vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
+ vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
+ vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
+ vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
+ vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
+ vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
+ vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
+ vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
+ vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
+ vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
+ vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
+ vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
+ vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
+ vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
+ vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
+ vst1q_u8(dst + 30 * stride + 0, d0_hi);
+ vst1q_u8(dst + 30 * stride + 16, a31);
+ vst1q_u8(dst + 31 * stride + 0, d1_hi);
+ vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+ uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;
+
+ az = load_unaligned_u8_4x1(above - 1);
+ a0 = load_unaligned_u8_4x1(above + 0);
+ // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+ l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+ col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
+ col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);
+
+ d0 = vrhadd_u8(az, a0);
+ d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+ d2 = vext_u8(col0, d0, 7);
+ d3 = vext_u8(col1, d1, 7);
+
+ store_u8_4x1(dst + 0 * stride, d0);
+ store_u8_4x1(dst + 1 * stride, d1);
+ store_u8_4x1(dst + 2 * stride, d2);
+ store_u8_4x1(dst + 3 * stride, d3);
+}
+
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+ az = vld1_u8(above - 1);
+ a0 = vld1_u8(above + 0);
+ // [ left[0], above[-1], ... , above[5] ]
+ l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+ l0 = vld1_u8(left + 0);
+ // The last lane here is unused; reading left[8] could cause a buffer
+ // over-read, so just fill it with a duplicate of left[0] to avoid needing
+ // to materialize a zero:
+ // [ left[1], ... , left[7], x ]
+ l1 = vext_u8(l0, l0, 1);
+ // [ above[-1], left[0], ... , left[6] ]
+ azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+ // d0[0] = AVG2(above[-1], above[0])
+ // d0[1] = AVG2(above[0], above[1])
+ // ...
+ // d0[7] = AVG2(above[6], above[7])
+ d0 = vrhadd_u8(az, a0);
+
+ // d1[0] = AVG3(left[0], above[-1], above[0])
+ // d1[1] = AVG3(above[-1], above[0], above[1])
+ // ...
+ // d1[7] = AVG3(above[5], above[6], above[7])
+ d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+ // The ext instruction shifts elements in from the end of the vector rather
+ // than the start, so reverse the vector to put the elements to be shifted in
+ // at the end. The lowest two lanes here are unused:
+ // col0[7] = AVG3(above[-1], left[0], left[1])
+ // col0[6] = AVG3(left[0], left[1], left[2])
+ // ...
+ // col0[2] = AVG3(left[4], left[5], left[6])
+ // col0[1] = x (don't care)
+ // col0[0] = x (don't care)
+ col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
+
+ // We don't care about the first parameter to this uzp since we only ever
+ // use the high three elements, so we just use col0 again since it is
+ // already available:
+ // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+ // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+ col0_even = vuzp_u8(col0, col0).val[1];
+ col0_odd = vuzp_u8(col0, col0).val[0];
+
+ // Incrementally shift more elements from col0 into d0/1:
+ // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+ // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+ // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
+ // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+ // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
+ // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
+ // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+ // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+ vst1_u8(dst + 0 * stride, d0);
+ vst1_u8(dst + 1 * stride, d1);
+ vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
+ vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
+ vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
+ vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
+ vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
+ vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
+}
+
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+ uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+ az = vld1q_u8(above - 1);
+ a0 = vld1q_u8(above + 0);
+ // [ left[0], above[-1], ... , above[13] ]
+ l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+ l0 = vld1q_u8(left + 0);
+ // The last lane here is unused; reading left[16] could cause a buffer
+ // over-read, so just fill it with a duplicate of left[0] to avoid needing
+ // to materialize a zero:
+ // [ left[1], ... , left[15], x ]
+ l1 = vextq_u8(l0, l0, 1);
+ // [ above[-1], left[0], ... , left[14] ]
+ azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+ d0 = vrhaddq_u8(az, a0);
+ d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+
+ col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
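+ // vrev64q_u8 only reverses within each 64-bit half, so swap the halves
+ // first to reverse the whole 16-lane vector.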
+ col0 = vrev64q_u8(vextq_u8(col0, col0, 8));
+
+ // The low nine lanes here are unused, so the first input to the uzp does
+ // not matter; just use a duplicate of col0 since we have it already. This
+ // also means that the lowest lane of col0 here is unused.
+ col0_even = vuzpq_u8(col0, col0).val[1];
+ col0_odd = vuzpq_u8(col0, col0).val[0];
+
+ vst1q_u8(dst + 0 * stride, d0);
+ vst1q_u8(dst + 1 * stride, d1);
+ vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
+ vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
+ vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
+ vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
+ vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
+ vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
+ vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
+ vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
+ vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
+ vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
+ vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
+ vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
+ vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
+ vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
+}
+
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+ uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
+ l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;
+
+ az = vld1q_u8(above - 1);
+ a0 = vld1q_u8(above + 0);
+ a14 = vld1q_u8(above + 14);
+ a15 = vld1q_u8(above + 15);
+ a16 = vld1q_u8(above + 16);
+ // [ left[0], above[-1], ... , above[13] ]
+ l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+ l0 = vld1q_u8(left + 0);
+ l1 = vld1q_u8(left + 1);
+ l15 = vld1q_u8(left + 15);
+ l16 = vld1q_u8(left + 16);
+ // The last lane here is unused; reading left[32] would cause a buffer
+ // over-read (observed as an address-sanitizer failure), so just fill it
+ // with a duplicate of left[16] to avoid needing to materialize a zero:
+ // [ left[17], ... , left[31], x ]
+ l17 = vextq_u8(l16, l16, 1);
+ // [ above[-1], left[0], ... , left[14] ]
+ azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+ d0_lo = vrhaddq_u8(az, a0);
+ d0_hi = vrhaddq_u8(a15, a16);
+ d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+ d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+ // The last lane of col0_hi is unused here.
+ col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+ col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+ col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
+ col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));
+
+ // The first lane of each of these is unused since they are only ever used
+ // as ext(col0, _, i) where i >= 1.
+ col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
+ col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];
+
+ vst1q_u8(dst + 0 * stride + 0, d0_lo);
+ vst1q_u8(dst + 0 * stride + 16, d0_hi);
+ vst1q_u8(dst + 1 * stride + 0, d1_lo);
+ vst1q_u8(dst + 1 * stride + 16, d1_hi);
+ vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
+ vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
+ vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
+ vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+ vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
+ vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
+ vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
+ vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
+ vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
+ vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
+ vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
+ vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+ vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
+ vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
+ vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
+ vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
+ vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
+ vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
+ vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
+ vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+ vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
+ vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
+ vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
+ vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
+ vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
+ vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
+ vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
+ vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+ vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
+ vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
+ vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
+ vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
+ vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
+ vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
+ vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
+ vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+ vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
+ vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
+ vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
+ vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
+ vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
+ vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
+ vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
+ vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+ vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
+ vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
+ vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
+ vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
+ vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
+ vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
+ vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
+ vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+ vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
+ vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
+ vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
+ vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
+ vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
+ vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
+ vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
+ vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XA0123 = vld1_u8(above - 1);
+ const uint8x8_t L0123 = vld1_u8(left);
+ const uint8x8_t L3210 = vrev64_u8(L0123);
+ const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
+ const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
+ const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1);
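+ // avg2[i] = AVG3(b[i], b[i + 1], b[i + 2]), where b is the boundary walked
+ // from the bottom-left:
+ // [ left[3], left[2], left[1], left[0], above[-1], above[0], above[1], ... ]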
+ const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);
+
+ store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3));
+ store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2));
+ store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1));
+ store_u8_4x1(dst + 3 * stride, avg2);
+}
+
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XA0123456 = vld1_u8(above - 1);
+ const uint8x8_t A01234567 = vld1_u8(above);
+ const uint8x8_t A1234567_ = vld1_u8(above + 1);
+ const uint8x8_t L01234567 = vld1_u8(left);
+ const uint8x8_t L76543210 = vrev64_u8(L01234567);
+ const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1);
+ const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2);
+ const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456);
+ const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567);
+ const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
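+ // row[i] = AVG3(b[i], b[i + 1], b[i + 2]), where b is the boundary walked
+ // from the bottom-left:
+ // [ left[7], ..., left[0], above[-1], above[0], ..., above[7] ]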
+ const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
+ const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
+
+ vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
+ vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
+ vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
+ vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
+ vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
+ vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
+ vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
+ vst1_u8(dst + 7 * stride, vget_low_u8(row));
+}
+
+static INLINE void d135_store_16x8(
+ uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0,
+ const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3,
+ const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6,
+ const uint8x16_t row_7) {
+ vst1q_u8(*dst, row_0);
+ *dst += stride;
+ vst1q_u8(*dst, row_1);
+ *dst += stride;
+ vst1q_u8(*dst, row_2);
+ *dst += stride;
+ vst1q_u8(*dst, row_3);
+ *dst += stride;
+ vst1q_u8(*dst, row_4);
+ *dst += stride;
+ vst1q_u8(*dst, row_5);
+ *dst += stride;
+ vst1q_u8(*dst, row_6);
+ *dst += stride;
+ vst1q_u8(*dst, row_7);
+ *dst += stride;
+}
+
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1);
+ const uint8x16_t A0123456789abcdef = vld1q_u8(above);
+ const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1);
+ const uint8x16_t L0123456789abcdef = vld1q_u8(left);
+ const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef));
+ const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef));
+ const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210);
+ const uint8x16_t Ledcba9876543210X =
+ vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1);
+ const uint8x16_t Ldcba9876543210XA0 =
+ vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2);
+ const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0);
+ const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
+ const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
+ const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
+
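+ // Each output row is a 16-wide window into [ row_0 | row_1 ], sliding one
+ // sample towards the left-boundary values per row: r_0 is the top row and
+ // row_0 itself is the bottom row.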
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+ const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
+ const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
+ const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12);
+ const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
+ const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
+ const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
+ const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
+ const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
+ const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
+ const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
+ const uint8x16_t r_b = vextq_u8(row_0, row_1, 4);
+ const uint8x16_t r_c = vextq_u8(row_0, row_1, 3);
+ const uint8x16_t r_d = vextq_u8(row_0, row_1, 2);
+ const uint8x16_t r_e = vextq_u8(row_0, row_1, 1);
+
+ d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7);
+ d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0);
+}
+
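+// Stores one row of the top half of the 32x32 block (row_1 | row_2) and the
+// matching row 16 lines below (row_0 | row_1): for d135, row r + 16 at column
+// c equals row r at column c - 16, so the two rows share row_1.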
+static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x16_t row_0,
+ const uint8x16_t row_1,
+ const uint8x16_t row_2) {
+ uint8_t *dst2 = *dst;
+ vst1q_u8(dst2, row_1);
+ dst2 += 16;
+ vst1q_u8(dst2, row_2);
+ dst2 += 16 * stride - 16;
+ vst1q_u8(dst2, row_0);
+ dst2 += 16;
+ vst1q_u8(dst2, row_1);
+ *dst += stride;
+}
+
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16);
+ const uint8x16_t LU0123456789abcdef = vld1q_u8(left);
+ const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef));
+ const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef));
+ const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef));
+ const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef));
+ const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210);
+ const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210);
+ const uint8x16_t LLedcba9876543210Uf =
+ vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1);
+ const uint8x16_t LLdcba9876543210Ufe =
+ vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2);
+ const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe);
+ const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf);
+
+ const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1);
+ const uint8x16_t LUedcba9876543210X =
+ vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1);
+ const uint8x16_t LUdcba9876543210XA0 =
+ vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2);
+ const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0);
+ const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X);
+
+ const uint8x16_t AL0123456789abcdef = vld1q_u8(above);
+ const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1);
+ const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15);
+ const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16);
+ const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17);
+ const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg);
+ const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef);
+ const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_);
+ const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef);
+
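+ // [ row_0 | row_1 | row_2 | row_3 ] holds AVG3 over consecutive triples of
+ // the boundary walked from left[31] up to above[31]; each block below emits
+ // one row of the top half and the matching row of the bottom half. The last
+ // lane of row_3 is unused.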
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ d135_store_32x2(&dst, stride, row_0, row_1, row_2);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+ uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;
+
+ az = load_unaligned_u8_4x1(above - 1);
+ a0 = load_unaligned_u8_4x1(above + 0);
+ // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+ l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+ l0 = load_unaligned_u8_4x1(left + 0);
+ l1 = load_unaligned_u8_4x1(left + 1);
+ // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
+ azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+ d0 = vrhadd_u8(azl0, l0);
+ d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+ d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+ d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);
+
+ store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
+ store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
+ store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
+ store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
+}
+
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+ az = vld1_u8(above - 1);
+ a0 = vld1_u8(above + 0);
+ // [ left[0], above[-1], ... , above[5] ]
+ l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+ l0 = vld1_u8(left);
+ // The last lane here is unused; reading left[8] could cause a buffer
+ // over-read, so just fill it with a duplicate of left[0] to avoid needing
+ // to materialize a zero:
+ // [ left[1], ... , left[7], x ]
+ l1 = vext_u8(l0, l0, 1);
+ // [ above[-1], left[0], ... , left[6] ]
+ azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+ // d0[0] = AVG2(above[-1], left[0])
+ // d0[1] = AVG2(left[0], left[1])
+ // ...
+ // d0[7] = AVG2(left[6], left[7])
+ d0 = vrhadd_u8(azl0, l0);
+
+ // d1[0] = AVG3(left[0], above[-1], above[0])
+ // d1[1] = AVG3(above[-1], above[0], above[1])
+ // ...
+ // d1[7] = AVG3(above[5], above[6], above[7])
+ d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+ // d2[0] = AVG3(above[-1], left[0], left[1])
+ // d2[1] = AVG3(left[0], left[1], left[2])
+ // ...
+ // d2[6] = AVG3(left[5], left[6], left[7])
+ // d2[7] = x (don't care)
+ d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+ // The ext instruction shifts elements in from the end of the vector rather
+ // than the start, so reverse the vectors to put the elements to be shifted
+ // in at the end. The lowest lane of d02_lo is unused.
+ d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
+ d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];
+
+ // Incrementally shift more elements from d0/d2 reversed into d1:
+ // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+ // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+ // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+ // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+ // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+ // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+ // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+ // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+ vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
+ vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
+ vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
+ vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
+ vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
+ vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
+ vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
+ vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+ uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+ az = vld1q_u8(above - 1);
+ a0 = vld1q_u8(above + 0);
+ // [ left[0], above[-1], ... , above[13] ]
+ l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+ l0 = vld1q_u8(left + 0);
+ // The last lane here is unused; reading left[16] could cause a buffer
+ // over-read, so just fill it with a duplicate of left[0] to avoid needing
+ // to materialize a zero:
+ // [ left[1], ... , left[15], x ]
+ l1 = vextq_u8(l0, l0, 1);
+ // [ above[-1], left[0], ... , left[14] ]
+ azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+ d0 = vrhaddq_u8(azl0, l0);
+ d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+ d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+
+ d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
+ d2 = vrev64q_u8(vextq_u8(d2, d2, 8));
+
+ // The lowest lane of d02_lo is unused.
+ d02_lo = vzipq_u8(d2, d0).val[0];
+ d02_hi = vzipq_u8(d2, d0).val[1];
+
+ vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
+ vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
+ vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
+ vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
+ vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
+ vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
+ vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
+ vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
+ vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
+ vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
+ vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
+ vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
+ vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
+ vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
+ vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
+ vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+ uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
+ d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
+ uint8x16x2_t d02_hi, d02_lo;
+
+ az = vld1q_u8(above - 1);
+ a0 = vld1q_u8(above + 0);
+ a14 = vld1q_u8(above + 14);
+ a15 = vld1q_u8(above + 15);
+ a16 = vld1q_u8(above + 16);
+ // [ left[0], above[-1], ... , above[13] ]
+ l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+ l0 = vld1q_u8(left);
+ l1 = vld1q_u8(left + 1);
+ l15 = vld1q_u8(left + 15);
+ l16 = vld1q_u8(left + 16);
+ // The last lane here is unused; reading left[32] would cause a buffer
+ // over-read (observed as an address-sanitizer failure), so just fill it
+ // with a duplicate of left[16] to avoid needing to materialize a zero:
+ // [ left[17], ... , left[31], x ]
+ l17 = vextq_u8(l16, l16, 1);
+ // [ above[-1], left[0], ... , left[14] ]
+ azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+ d0_lo = vrhaddq_u8(azl0, l0);
+ d0_hi = vrhaddq_u8(l15, l16);
+
+ d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+ d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+ // The highest lane of d2_hi is unused.
+ d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+ d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+ d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
+ d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));
+
+ d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
+ d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));
+
+ // d02_hi.val[0][0] is unused here.
+ d02_hi = vzipq_u8(d2_hi, d0_hi);
+ d02_lo = vzipq_u8(d2_lo, d0_lo);
+
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
+ vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+ vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
+ vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+ vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
+ vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+ vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
+ vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+ vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
+ vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+ vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
+ vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+ vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
+ vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+ vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
+ vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+ vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+ vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
+ vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+ vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
+ vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+ vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
+ vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+ vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
+ vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+ vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
+ vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+ vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
+ vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+ vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
+ vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+ vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
+ vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+ vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+ vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+ vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+ vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+ vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+ vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+ vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+ vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+ vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+ vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+ vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+ vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+ vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+ vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+ vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+ vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
+ vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+ vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
+ vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+ vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
+ vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+ vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
+ vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+ vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
+ vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+ vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
+ vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+ vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
+ vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+ vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
+ vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
+ (void)above;
+
+ // We need the low half lanes here for the c0/c1 arithmetic but the high half
+ // lanes for the ext:
+ // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
+ l0 = load_replicate_u8_4x1(left + 0);
+ l3 = vld1_dup_u8(left + 3);
+
+ // [ left[1], left[2], left[3], left[3], x, x, x, x ]
+ l1 = vext_u8(l0, l3, 5);
+ // [ left[2], left[3], left[3], left[3], x, x, x, x ]
+ l2 = vext_u8(l0, l3, 6);
+
+ c0 = vrhadd_u8(l0, l1);
+ c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+ // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
+ c01 = vzip_u8(c0, c1).val[0];
+
+ d0 = c01;
+ d1 = vext_u8(c01, l3, 2);
+
+ // Store the high half of the vector for stride={2,3} to avoid needing
+ // additional ext instructions:
+ // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
+ // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
+ // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
+ // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+ store_u8_4x1(dst + 0 * stride, d0);
+ store_u8_4x1(dst + 1 * stride, d1);
+ store_u8_4x1_high(dst + 2 * stride, d0);
+ store_u8_4x1_high(dst + 3 * stride, d1);
+}
+
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+ (void)above;
+
+ l0 = vld1_u8(left + 0);
+ l7 = vld1_dup_u8(left + 7);
+
+ // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+ l1 = vext_u8(l0, l7, 1);
+ // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+ l2 = vext_u8(l0, l7, 2);
+
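+ // c0[i] = AVG2(left[i], left[i + 1])
+ // c1[i] = AVG3(left[i], left[i + 1], left[i + 2])
+ // (indices past 7 are clamped to left[7].)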
+ c0 = vrhadd_u8(l0, l1);
+ c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+ c01_lo = vzip_u8(c0, c1).val[0];
+ c01_hi = vzip_u8(c0, c1).val[1];
+
+ vst1_u8(dst + 0 * stride, c01_lo);
+ vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
+ vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
+ vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
+ vst1_u8(dst + 4 * stride, c01_hi);
+ vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
+ vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
+ vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
+}
+
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+ (void)above;
+
+ l0 = vld1q_u8(left + 0);
+ l15 = vld1q_dup_u8(left + 15);
+
+ l1 = vextq_u8(l0, l15, 1);
+ l2 = vextq_u8(l0, l15, 2);
+
+ c0 = vrhaddq_u8(l0, l1);
+ c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);
+
+ c01_lo = vzipq_u8(c0, c1).val[0];
+ c01_hi = vzipq_u8(c0, c1).val[1];
+
+ vst1q_u8(dst + 0 * stride, c01_lo);
+ vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
+ vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
+ vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
+ vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
+ vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
+ vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
+ vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
+ vst1q_u8(dst + 8 * stride, c01_hi);
+ vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
+ vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
+ vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
+ vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
+ vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
+ vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
+ vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
+}
+
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
+ c1_hi, c01[4];
+ (void)above;
+
+ l0_lo = vld1q_u8(left + 0);
+ l0_hi = vld1q_u8(left + 16);
+ l31 = vld1q_dup_u8(left + 31);
+
+ l1_lo = vextq_u8(l0_lo, l0_hi, 1);
+ l1_hi = vextq_u8(l0_hi, l31, 1);
+ l2_lo = vextq_u8(l0_lo, l0_hi, 2);
+ l2_hi = vextq_u8(l0_hi, l31, 2);
+
+ c0_lo = vrhaddq_u8(l0_lo, l1_lo);
+ c0_hi = vrhaddq_u8(l0_hi, l1_hi);
+ c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
+ c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);
+
+ c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
+ c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
+ c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
+ c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];
+
+ vst1q_u8(dst + 0 * stride + 0, c01[0]);
+ vst1q_u8(dst + 0 * stride + 16, c01[1]);
+ vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
+ vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
+ vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
+ vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
+ vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
+ vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
+ vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
+ vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
+ vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
+ vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
+ vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
+ vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
+ vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
+ vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
+ vst1q_u8(dst + 8 * stride + 0, c01[1]);
+ vst1q_u8(dst + 8 * stride + 16, c01[2]);
+ vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
+ vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
+ vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
+ vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
+ vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
+ vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
+ vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
+ vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
+ vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
+ vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
+ vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
+ vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
+ vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
+ vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
+ vst1q_u8(dst + 16 * stride + 0, c01[2]);
+ vst1q_u8(dst + 16 * stride + 16, c01[3]);
+ vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
+ vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
+ vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
+ vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
+ vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
+ vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
+ vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
+ vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
+ vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
+ vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
+ vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
+ vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
+ vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
+ vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
+ vst1q_u8(dst + 24 * stride + 0, c01[3]);
+ vst1q_u8(dst + 24 * stride + 16, l31);
+ vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
+ vst1q_u8(dst + 25 * stride + 16, l31);
+ vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
+ vst1q_u8(dst + 26 * stride + 16, l31);
+ vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
+ vst1q_u8(dst + 27 * stride + 16, l31);
+ vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
+ vst1q_u8(dst + 28 * stride + 16, l31);
+ vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
+ vst1q_u8(dst + 29 * stride + 16, l31);
+ vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
+ vst1q_u8(dst + 30 * stride + 16, l31);
+ vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
+ vst1q_u8(dst + 31 * stride + 16, l31);
+}
+
+// -----------------------------------------------------------------------------
+
+#if !HAVE_NEON_ASM
+
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t d = *(const uint32_t *)above;
+ int i;
+ (void)left;
+
+ for (i = 0; i < 4; i++, dst += stride) {
+ *(uint32_t *)dst = d;
+ }
+}
+
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d = vld1_u8(above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1_u8(dst, d);
+ }
+}
+
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vld1q_u8(above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vst1q_u8(dst, d);
+ }
+}
+
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 32; i++) {
+ // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
+ // clang-3.8 unrolled the loop fully with no filler so the cause is likely
+ // the latency of the instruction.
+ vst1q_u8(dst, d0);
+ dst += 16;
+ vst1q_u8(dst, d1);
+ dst += stride - 16;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint8x8_t left_u8 =
+ vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
+ uint8x8_t d;
+ (void)above;
+
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+}
+
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ uint8x8_t d;
+ (void)above;
+
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 4);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 5);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 6);
+ vst1_u8(dst, d);
+ dst += stride;
+ d = vdup_lane_u8(left_u8, 7);
+ vst1_u8(dst, d);
+}
+
+static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0);
+ *dst += stride;
+ vst1q_u8(*dst, row_1);
+ *dst += stride;
+ vst1q_u8(*dst, row_2);
+ *dst += stride;
+ vst1q_u8(*dst, row_3);
+ *dst += stride;
+ vst1q_u8(*dst, row_4);
+ *dst += stride;
+ vst1q_u8(*dst, row_5);
+ *dst += stride;
+ vst1q_u8(*dst, row_6);
+ *dst += stride;
+ vst1q_u8(*dst, row_7);
+ *dst += stride;
+}
+
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t left_u8q = vld1q_u8(left);
+ (void)above;
+
+ h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
+ h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
+}
+
+static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8
+ *dst += 16;
+ vst1q_u8(*dst, row_0);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_1);
+ *dst += 16;
+ vst1q_u8(*dst, row_1);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_2);
+ *dst += 16;
+ vst1q_u8(*dst, row_2);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_3);
+ *dst += 16;
+ vst1q_u8(*dst, row_3);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_4);
+ *dst += 16;
+ vst1q_u8(*dst, row_4);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_5);
+ *dst += 16;
+ vst1q_u8(*dst, row_5);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_6);
+ *dst += 16;
+ vst1q_u8(*dst, row_6);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_7);
+ *dst += 16;
+ vst1q_u8(*dst, row_7);
+ *dst += stride - 16;
+}
+
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ (void)above;
+
+ for (i = 0; i < 2; i++, left += 16) {
+ const uint8x16_t left_u8 = vld1q_u8(left);
+ h_store_32x8(&dst, stride, vget_low_u8(left_u8));
+ h_store_32x8(&dst, stride, vget_high_u8(left_u8));
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
+}
+
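+// TM ("true motion") prediction: dst[r * stride + c] is
+// left[r] + above[c] - above[-1], clamped to the range [0, 255].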
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
+ int16x8_t sub, sum;
+ uint32x2_t d;
+
+ sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
+ sub = vreinterpretq_s16_s64(
+ vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+}
+
+static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub) {
+ const int16x8_t sum = vaddq_s16(left_dup, sub);
+ const uint8x8_t d = vqmovun_s16(sum);
+ vst1_u8(*dst, d);
+ *dst += stride;
+}
+
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ int i;
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ int16x8_t left_dup;
+
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ }
+}
+
+static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ vst1_u8(*dst, d0);
+ *dst += 8;
+ vst1_u8(*dst, d1);
+ *dst += stride - 8;
+}
+
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_u8 = vld1q_u8(above);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x4_t left_low = vget_low_s16(left_s16q);
+ const int16x4_t left_high = vget_high_s16(left_s16q);
+
+ left_dup = vdupq_lane_s16(left_low, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+
+ left_dup = vdupq_lane_s16(left_high, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ }
+}
+
+static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ const uint8x8_t d2 = vqmovun_s16(sum2);
+ const uint8x8_t d3 = vqmovun_s16(sum3);
+
+ vst1q_u8(*dst, vcombine_u8(d0, d1));
+ *dst += 16;
+ vst1q_u8(*dst, vcombine_u8(d2, d3));
+ *dst += stride - 16;
+}
+
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_low = vld1q_u8(above);
+ const uint8x16_t above_high = vld1q_u8(above + 16);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
+ const int16x8_t sub2 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
+ const int16x8_t sub3 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i, j;
+
+ for (j = 0; j < 4; j++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ }
+ }
+}
+#endif // !HAVE_NEON_ASM
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 0000000000..115790d480
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,630 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_v_predictor_4x4_neon|
+ EXPORT |vpx_v_predictor_8x8_neon|
+ EXPORT |vpx_v_predictor_16x16_neon|
+ EXPORT |vpx_v_predictor_32x32_neon|
+ EXPORT |vpx_h_predictor_4x4_neon|
+ EXPORT |vpx_h_predictor_8x8_neon|
+ EXPORT |vpx_h_predictor_16x16_neon|
+ EXPORT |vpx_h_predictor_32x32_neon|
+ EXPORT |vpx_tm_predictor_4x4_neon|
+ EXPORT |vpx_tm_predictor_8x8_neon|
+ EXPORT |vpx_tm_predictor_16x16_neon|
+ EXPORT |vpx_tm_predictor_32x32_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_4x4_neon| PROC
+ vld1.32 {d0[0]}, [r2]
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_4x4_neon|
+
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_8x8_neon| PROC
+ vld1.8 {d0}, [r2]
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_8x8_neon|
+
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_16x16_neon| PROC
+ vld1.8 {q0}, [r2]
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_16x16_neon|
+
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_32x32_neon| PROC
+ vld1.8 {q0, q1}, [r2]
+ mov r2, #2
+loop_v
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_v
+ bx lr
+ ENDP ; |vpx_v_predictor_32x32_neon|
+
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_4x4_neon| PROC
+ vld1.32 {d1[0]}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_4x4_neon|
+
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_8x8_neon| PROC
+ vld1.64 {d1}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[4]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[5]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[6]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[7]
+ vst1.64 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_8x8_neon|
+
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_16x16_neon| PROC
+ vld1.8 {q1}, [r3]
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_16x16_neon|
+
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_32x32_neon| PROC
+ sub r1, r1, #16
+ mov r2, #2
+loop_h
+ vld1.8 {q1}, [r3]!
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_h
+ bx lr
+ ENDP ; |vpx_h_predictor_32x32_neon|
+
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_4x4_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.u8 {d0[]}, [r12]
+
+ ; Load above 4 pixels
+ vld1.32 {d2[0]}, [r2]
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]!
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+
+ ; 3rd row and 4th row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_tm_predictor_4x4_neon|
+
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_8x8_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; preload 8 left
+ vld1.8 {d30}, [r3]
+
+ ; Load above 8 pixels
+ vld1.64 {d2}, [r2]
+
+ vmovl.u8 q10, d30
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 3rd row and 4th row
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ ; 5th row and 6th row
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_8x8_neon|
+
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_16x16_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+    ; Load above 16 pixels
+ vld1.8 {q1}, [r2]
+
+    ; preload 8 left pixels
+ vld1.8 {d18}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d0
+
+ vmovl.u8 q10, d18
+
+ ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each loop iteration and loop 2 times to process 16 rows.
+ mov r2, #2
+
+loop_16x16_neon
+ ; Process two rows.
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16     q0, d20[2]                  ; preload next 2 rows of data
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16     q0, d21[0]                  ; preload next 2 rows of data
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vdup.16     q0, d21[2]                  ; preload next 2 rows of data
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+    vld1.8      {d18}, [r3]!                ; preload next 8 left pixels
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_16x16_neon|
+
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d0
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d0
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each loop iteration and loop 4 times to process 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vld1.8 {d0}, [r3]! ; preload 8 left pixels
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vmovl.u8 q3, d0
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_32x32_neon|
+
+ END
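The V and H predictors in the assembly above are plain fills: V copies the row of above-neighbours into every output row, and H broadcasts left[r] across row r. In scalar C the pair reduces to roughly this (illustrative sketch, placeholder names):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* V prediction: every row is a copy of the pixels above the block. */
static void v_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int size,
                               const uint8_t *above) {
  int r;
  for (r = 0; r < size; ++r, dst += stride) memcpy(dst, above, size);
}

/* H prediction: row r is filled with the left-neighbour pixel left[r]. */
static void h_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int size,
                               const uint8_t *left) {
  int r;
  for (r = 0; r < size; ++r, dst += stride) memset(dst, left[r], size);
}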
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
new file mode 100644
index 0000000000..730c40de0e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,666 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_16_neon|
+ EXPORT |vpx_lpf_horizontal_16_dual_neon|
+ EXPORT |vpx_lpf_vertical_16_neon|
+ EXPORT |vpx_lpf_vertical_16_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_horizontal_edge| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+h_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d0}, [r8@64], r1 ; p7
+ vld1.u8 {d1}, [r8@64], r1 ; p6
+ vld1.u8 {d2}, [r8@64], r1 ; p5
+ vld1.u8 {d3}, [r8@64], r1 ; p4
+ vld1.u8 {d4}, [r8@64], r1 ; p3
+ vld1.u8 {d5}, [r8@64], r1 ; p2
+ vld1.u8 {d6}, [r8@64], r1 ; p1
+ vld1.u8 {d7}, [r8@64], r1 ; p0
+ vld1.u8 {d8}, [r8@64], r1 ; q0
+ vld1.u8 {d9}, [r8@64], r1 ; q1
+ vld1.u8 {d10}, [r8@64], r1 ; q2
+ vld1.u8 {d11}, [r8@64], r1 ; q3
+ vld1.u8 {d12}, [r8@64], r1 ; q4
+ vld1.u8 {d13}, [r8@64], r1 ; q5
+ vld1.u8 {d14}, [r8@64], r1 ; q6
+ vld1.u8 {d15}, [r8@64], r1 ; q7
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq h_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, r1, lsl #1
+
+ vst1.u8 {d25}, [r8@64], r1 ; store op1
+ vst1.u8 {d24}, [r8@64], r1 ; store op0
+ vst1.u8 {d23}, [r8@64], r1 ; store oq0
+ vst1.u8 {d26}, [r8@64], r1 ; store oq1
+
+ b h_next
+
+h_mbfilter
+ tst r7, #2
+ beq h_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, r1, lsl #1
+ sub r8, r8, r1
+
+ vst1.u8 {d18}, [r8@64], r1 ; store op2
+ vst1.u8 {d19}, [r8@64], r1 ; store op1
+ vst1.u8 {d20}, [r8@64], r1 ; store op0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq0
+ vst1.u8 {d22}, [r8@64], r1 ; store oq1
+ vst1.u8 {d23}, [r8@64], r1 ; store oq2
+
+ b h_next
+
+h_wide_mbfilter
+ sub r8, r0, r1, lsl #3
+ add r8, r8, r1
+
+ vst1.u8 {d16}, [r8@64], r1 ; store op6
+ vst1.u8 {d24}, [r8@64], r1 ; store op5
+ vst1.u8 {d25}, [r8@64], r1 ; store op4
+ vst1.u8 {d26}, [r8@64], r1 ; store op3
+ vst1.u8 {d27}, [r8@64], r1 ; store op2
+ vst1.u8 {d18}, [r8@64], r1 ; store op1
+ vst1.u8 {d19}, [r8@64], r1 ; store op0
+ vst1.u8 {d20}, [r8@64], r1 ; store oq0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq1
+ vst1.u8 {d22}, [r8@64], r1 ; store oq2
+ vst1.u8 {d23}, [r8@64], r1 ; store oq3
+ vst1.u8 {d1}, [r8@64], r1 ; store oq4
+ vst1.u8 {d2}, [r8@64], r1 ; store oq5
+ vst1.u8 {d3}, [r8@64], r1 ; store oq6
+
+h_next
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne h_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_16_neon| PROC
+ mov r12, #1
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_16_neon|
+
+; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_16_dual_neon| PROC
+ mov r12, #2
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_16_dual_neon|
+
+; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit, const uint8_t *thresh,
+; int count) {
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_vertical_edge_w| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+v_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, #8
+
+ vld1.8 {d0}, [r8@64], r1
+ vld1.8 {d8}, [r0@64], r1
+ vld1.8 {d1}, [r8@64], r1
+ vld1.8 {d9}, [r0@64], r1
+ vld1.8 {d2}, [r8@64], r1
+ vld1.8 {d10}, [r0@64], r1
+ vld1.8 {d3}, [r8@64], r1
+ vld1.8 {d11}, [r0@64], r1
+ vld1.8 {d4}, [r8@64], r1
+ vld1.8 {d12}, [r0@64], r1
+ vld1.8 {d5}, [r8@64], r1
+ vld1.8 {d13}, [r0@64], r1
+ vld1.8 {d6}, [r8@64], r1
+ vld1.8 {d14}, [r0@64], r1
+ vld1.8 {d7}, [r8@64], r1
+ vld1.8 {d15}, [r0@64], r1
+
+ sub r0, r0, r1, lsl #3
+
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+ vtrn.8 d4, d5
+ vtrn.8 d6, d7
+
+ vtrn.8 d8, d9
+ vtrn.8 d10, d11
+ vtrn.8 d12, d13
+ vtrn.8 d14, d15
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq v_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r0, #2
+
+ vswp d23, d25
+
+ vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1
+ vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1
+ vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1
+ vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1
+ vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1
+ vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1
+ vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1
+ vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1
+ add r0, #2
+
+ b v_next
+
+v_mbfilter
+ tst r7, #2
+ beq v_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, #3
+
+ vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
+ vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
+ vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
+ vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
+ vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
+ vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
+ vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
+ vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
+ vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
+ vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
+ vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
+ vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
+ vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
+ vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
+ vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
+ vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
+
+ b v_next
+
+v_wide_mbfilter
+ sub r8, r0, #8
+
+ vtrn.32 d0, d26
+ vtrn.32 d16, d27
+ vtrn.32 d24, d18
+ vtrn.32 d25, d19
+
+ vtrn.16 d0, d24
+ vtrn.16 d16, d25
+ vtrn.16 d26, d18
+ vtrn.16 d27, d19
+
+ vtrn.8 d0, d16
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d18, d19
+
+ vtrn.32 d20, d1
+ vtrn.32 d21, d2
+ vtrn.32 d22, d3
+ vtrn.32 d23, d15
+
+ vtrn.16 d20, d22
+ vtrn.16 d21, d23
+ vtrn.16 d1, d3
+ vtrn.16 d2, d15
+
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d1, d2
+ vtrn.8 d3, d15
+
+ vst1.8 {d0}, [r8@64], r1
+ vst1.8 {d20}, [r0@64], r1
+ vst1.8 {d16}, [r8@64], r1
+ vst1.8 {d21}, [r0@64], r1
+ vst1.8 {d24}, [r8@64], r1
+ vst1.8 {d22}, [r0@64], r1
+ vst1.8 {d25}, [r8@64], r1
+ vst1.8 {d23}, [r0@64], r1
+ vst1.8 {d26}, [r8@64], r1
+ vst1.8 {d1}, [r0@64], r1
+ vst1.8 {d27}, [r8@64], r1
+ vst1.8 {d2}, [r0@64], r1
+ vst1.8 {d18}, [r8@64], r1
+ vst1.8 {d3}, [r0@64], r1
+ vst1.8 {d19}, [r8@64], r1
+ vst1.8 {d15}, [r0@64], r1
+
+v_next
+ subs r12, #1
+ bne v_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_vertical_edge_w|
+
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit, const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_vertical_16_neon| PROC
+ mov r12, #1
+ b mb_lpf_vertical_edge_w
+ ENDP ; |vpx_lpf_vertical_16_neon|
+
+; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_vertical_16_dual_neon| PROC
+ mov r12, #2
+ b mb_lpf_vertical_edge_w
+ ENDP ; |vpx_lpf_vertical_16_dual_neon|
+
+; void vpx_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16 blimit
+; d17 limit
+; d18 thresh
+; d0 p7
+; d1 p6
+; d2 p5
+; d3 p4
+; d4 p3
+; d5 p2
+; d6 p1
+; d7 p0
+; d8 q0
+; d9 q1
+; d10 q2
+; d11 q3
+; d12 q4
+; d13 q5
+; d14 q6
+; d15 q7
+|vpx_wide_mbfilter_neon| PROC
+ mov r7, #0
+
+ ; filter_mask
+ vabd.u8 d19, d4, d5 ; abs(p3 - p2)
+ vabd.u8 d20, d5, d6 ; abs(p2 - p1)
+ vabd.u8 d21, d6, d7 ; abs(p1 - p0)
+ vabd.u8 d22, d9, d8 ; abs(q1 - q0)
+ vabd.u8 d23, d10, d9 ; abs(q2 - q1)
+ vabd.u8 d24, d11, d10 ; abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
+ vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
+ vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d7, d8 ; abs(p0 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d17, d19
+
+ ; flatmask4
+ vabd.u8 d25, d7, d5 ; abs(p0 - p2)
+ vabd.u8 d26, d8, d10 ; abs(q0 - q2)
+ vabd.u8 d27, d4, d7 ; abs(p3 - p0)
+ vabd.u8 d28, d11, d8 ; abs(q3 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
+ vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
+ vmax.u8 d25, d25, d26
+ vmax.u8 d20, d20, d25
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmov.u8 d30, #1
+ vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
+
+ vcge.u8 d20, d30, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ ; hevmask
+ vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
+ vorr d21, d21, d22 ; hev
+
+ vand d16, d20, d19 ; flat && mask
+ vmov r5, r6, d16
+
+ ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+ vabd.u8 d22, d3, d7 ; abs(p4 - p0)
+ vabd.u8 d23, d12, d8 ; abs(q4 - q0)
+ vabd.u8 d24, d7, d2 ; abs(p0 - p5)
+ vabd.u8 d25, d8, d13 ; abs(q0 - q5)
+ vabd.u8 d26, d1, d7 ; abs(p6 - p0)
+ vabd.u8 d27, d14, d8 ; abs(q6 - q0)
+ vabd.u8 d28, d0, d7 ; abs(p7 - p0)
+ vabd.u8 d29, d15, d8 ; abs(q7 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
+ vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
+ vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
+ vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
+
+ vmax.u8 d26, d22, d23
+ vmax.u8 d27, d24, d25
+ vmax.u8 d23, d26, d27
+
+ vcge.u8 d18, d30, d23 ; flat2
+
+ vmov.u8 d22, #0x80
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #1 ; Only do filter branch
+
+ vand d17, d18, d16 ; flat2 && flat && mask
+ vmov r5, r6, d17
+
+ ; mbfilter() function
+
+ ; filter() function
+ ; convert to signed
+ veor d23, d8, d22 ; qs0
+ veor d24, d7, d22 ; ps0
+ veor d25, d6, d22 ; ps1
+ veor d26, d9, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+ vand d29, d29, d21 ; filter &= hev
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d21 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d23, d23, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ tst r7, #1
+ bxne lr
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
+ ; mbfilter flat && mask branch
+    ; TODO(fgalligan): Can I decrease the cycles by shifting to consecutive d's
+    ; and using vbit on the q's?
+ vmov.u8 d29, #2
+ vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
+ vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
+ vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+ vaddl.u8 q10, d4, d5
+ vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+ vaddl.u8 q14, d6, d9
+ vqrshrn.u16 d18, q15, #3 ; r_op2
+
+ vsub.i16 q15, q10
+ vaddl.u8 q10, d4, d6
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d7, d10
+ vqrshrn.u16 d19, q15, #3 ; r_op1
+
+ vsub.i16 q15, q10
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d8, d11
+ vqrshrn.u16 d20, q15, #3 ; r_op0
+
+ vsubw.u8 q15, d4 ; oq0 = op0 - p3
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d9, d11
+ vqrshrn.u16 d21, q15, #3 ; r_oq0
+
+ vsubw.u8 q15, d5 ; oq1 = oq0 - p2
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d10, d11
+ vqrshrn.u16 d22, q15, #3 ; r_oq1
+
+ vsubw.u8 q15, d6 ; oq2 = oq0 - p1
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vadd.i16 q15, q14
+ vqrshrn.u16 d27, q15, #3 ; r_oq2
+
+ ; Filter does not set op2 or oq2, so use p2 and q2.
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
+ vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
+ vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
+ vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
+ vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+ vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
+ vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
+
+ tst r7, #2
+ bxne lr
+
+ ; wide_mbfilter flat2 && flat && mask branch
+ vmov.u8 d16, #7
+ vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
+ vaddl.u8 q12, d2, d3
+ vaddl.u8 q13, d4, d5
+ vaddl.u8 q14, d1, d6
+ vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
+ vadd.i16 q12, q13
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vadd.i16 q15, q12
+ vaddl.u8 q12, d0, d1
+ vaddw.u8 q15, d1
+ vaddl.u8 q13, d0, d2
+ vadd.i16 q14, q15, q14
+ vqrshrn.u16 d16, q15, #4 ; w_op6
+
+ vsub.i16 q15, q14, q12
+ vaddl.u8 q14, d3, d10
+ vqrshrn.u16 d24, q15, #4 ; w_op5
+
+ vsub.i16 q15, q13
+ vaddl.u8 q13, d0, d3
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vqrshrn.u16 d25, q15, #4 ; w_op4
+
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d0, d4
+ vsub.i16 q15, q13
+ vsub.i16 q14, q15, q14
+ vqrshrn.u16 d26, q15, #4 ; w_op3
+
+ vaddw.u8 q15, q14, d5 ; op2 += p2
+ vaddl.u8 q14, d0, d5
+ vaddw.u8 q15, d12 ; op2 += q4
+ vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
+ vqrshrn.u16 d27, q15, #4 ; w_op2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d6
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d13 ; op1 += q5
+ vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
+ vqrshrn.u16 d18, q15, #4 ; w_op1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d7
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d14 ; op0 += q6
+ vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
+ vqrshrn.u16 d19, q15, #4 ; w_op0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d1, d8
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d15 ; oq0 += q7
+ vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
+ vqrshrn.u16 d20, q15, #4 ; w_oq0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddl.u8 q4, d10, d15
+ vaddw.u8 q15, d15 ; oq1 += q7
+ vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
+ vqrshrn.u16 d21, q15, #4 ; w_oq1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d3, d10
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d11, d15
+ vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
+ vqrshrn.u16 d22, q15, #4 ; w_oq2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d12, d15
+ vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
+ vqrshrn.u16 d23, q15, #4 ; w_oq3
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d5, d12
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d13, d15
+ vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
+ vqrshrn.u16 d1, q15, #4 ; w_oq4
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d6, d13
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d14, d15
+ vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
+ vqrshrn.u16 d2, q15, #4 ; w_oq5
+
+ vsub.i16 q15, q14
+ vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
+ vadd.i16 q15, q4
+ vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
+ vqrshrn.u16 d3, q15, #4 ; w_oq6
+ vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
+ vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
+ vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
+
+ bx lr
+ ENDP ; |vpx_wide_mbfilter_neon|
+
+ END
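The decision logic at the top of vpx_wide_mbfilter_neon mirrors the usual VP9 loop-filter tests: a filter mask derived from limit/blimit, an hev (high edge variance) flag derived from thresh, and flat/flat2 flags that pick progressively wider filters. Per pixel column, the mask and hev tests are roughly the following (scalar sketch only; names are placeholders):

#include <stdlib.h> /* abs */

/* 1 if the edge may be filtered at all: all neighbour deltas within limit and
 * the combined p0/q0, p1/q1 delta within blimit. */
static int filter_mask_sketch(int limit, int blimit, int p3, int p2, int p1,
                              int p0, int q0, int q1, int q2, int q3) {
  int m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(p1 - p0) > m) m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* 1 if the edge has high variance, which disables the outer-tap adjustment. */
static int hev_sketch(int thresh, int p1, int p0, int q0, int q1) {
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}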
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 0000000000..907e918380
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,549 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_4_neon|
+ EXPORT |vpx_lpf_vertical_4_neon|
+ EXPORT |vpx_lpf_horizontal_4_dual_neon|
+ EXPORT |vpx_lpf_vertical_4_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl filter4_8
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |vpx_lpf_horizontal_4_neon|
+
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+    ;transpose to 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl filter4_8
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ pop {pc}
+ ENDP ; |vpx_lpf_vertical_4_neon|
+
+; void filter4_8();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|filter4_8| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; abs(m1) > limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a > blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |filter4_8|
+
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_horizontal_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ add r1, r1, r1 ; double pitch
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, r1, lsl #1 ; s[-4 * p]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ add r3, r2, r1, lsr #1 ; s[-3 * p]
+
+ vld1.u8 {q3}, [r2@64], r1 ; p3
+ vld1.u8 {q4}, [r3@64], r1 ; p2
+ vld1.u8 {q5}, [r2@64], r1 ; p1
+ vld1.u8 {q6}, [r3@64], r1 ; p0
+ vld1.u8 {q7}, [r2@64], r1 ; q0
+ vld1.u8 {q8}, [r3@64], r1 ; q1
+ vld1.u8 {q9}, [r2@64] ; q2
+ vld1.u8 {q10}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl filter4_16
+
+ vst1.u8 {q5}, [r2@64], r1 ; store op1
+ vst1.u8 {q6}, [r3@64], r1 ; store op0
+ vst1.u8 {q7}, [r2@64], r1 ; store oq0
+ vst1.u8 {q8}, [r3@64], r1 ; store oq1
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vpx_lpf_horizontal_4_dual_neon|
+
+;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_vertical_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, #4 ; s[-4]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07
+ vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17
+ vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27
+ vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37
+ vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47
+ vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57
+ vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67
+ vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77
+ vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87
+ vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97
+ vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7
+ vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7
+ vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7
+ vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7
+ vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7
+ vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7
+
+ vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
+ ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
+ vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
+ ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
+ vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
+ ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
+
+ vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
+ ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
+ vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
+ ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
+ vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
+ ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
+ vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
+ ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
+
+ vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+
+ bl filter4_16
+
+ sub r0, #2
+
+ vmov d0, d11
+ vmov d1, d13
+ vmov d2, d15
+ vmov d3, d17
+ vmov d11, d12
+ vmov d12, d14
+ vmov d13, d16
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0]
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vpx_lpf_vertical_4_dual_neon|
+
+; void filter4_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0 blimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+;
+; Outputs:
+; q5 op1
+; q6 op0
+; q7 oq0
+; q8 oq1
+|filter4_16| PROC
+
+ ; filter_mask
+ vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
+
+ vmov.u8 q10, #0x80
+
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ veor q7, q7, q10 ; qs0
+
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
+
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u16 q4, #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; a > blimit
+
+ vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; hev
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; filter &= hev
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
+
+ vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; filter &= mask
+
+ vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3)
+ vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4)
+ vshr.s8 q2, q2, #3 ; filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2)
+ vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
+
+ veor q7, q0, q10 ; *oq0 = u^0x80
+
+ vbic q1, q1, q14 ; filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
+
+ veor q6, q11, q10 ; *op0 = u^0x80
+ veor q5, q13, q10 ; *op1 = u^0x80
+ veor q8, q12, q10 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |filter4_16|
+
+ END
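filter4_8 and filter4_16 implement the narrow filter whose steps the comments spell out: bias the four centre samples to signed range, build a clamped filter value from (ps1 - qs1) and 3 * (qs0 - ps0), split it into filter1/filter2, adjust p0/q0, and adjust p1/q1 only where hev is clear. A scalar sketch of that core, with clamp8() standing in for the signed-byte saturation the vqadd/vqsub instructions provide (illustrative only):

static int clamp8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

/* Sketch of the 4-tap filter core on signed (pixel - 0x80) samples.
 * mask and hev are 0 or 1, as produced by the decision logic. */
static void filter4_sketch(int mask, int hev, int *op1, int *op0, int *oq0,
                           int *oq1) {
  const int ps1 = *op1, ps0 = *op0, qs0 = *oq0, qs1 = *oq1;
  int filter = hev ? clamp8(ps1 - qs1) : 0;
  int filter1, filter2;
  filter = mask ? clamp8(filter + 3 * (qs0 - ps0)) : 0;
  filter1 = clamp8(filter + 4) >> 3;
  filter2 = clamp8(filter + 3) >> 3;
  *op0 = clamp8(ps0 + filter2);
  *oq0 = clamp8(qs0 - filter1);
  filter = hev ? 0 : (filter1 + 1) >> 1; /* rounded half, outer taps only */
  *op1 = clamp8(ps1 + filter);
  *oq1 = clamp8(qs1 - filter);
}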
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
new file mode 100644
index 0000000000..a81a9d1013
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -0,0 +1,491 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_8_neon|
+ EXPORT |vpx_lpf_horizontal_8_dual_neon|
+ EXPORT |vpx_lpf_vertical_8_neon|
+ EXPORT |vpx_lpf_vertical_8_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_horizontal_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #12] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r2, r3, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r3@64], r1 ; p3
+ vld1.u8 {d4}, [r2@64], r1 ; p2
+ vld1.u8 {d5}, [r3@64], r1 ; p1
+ vld1.u8 {d6}, [r2@64], r1 ; p0
+ vld1.u8 {d7}, [r3@64], r1 ; q0
+ vld1.u8 {d16}, [r2@64], r1 ; q1
+ vld1.u8 {d17}, [r3@64] ; q2
+ vld1.u8 {d18}, [r2@64], r1 ; q3
+
+ sub r3, r3, r1, lsl #1
+ sub r2, r2, r1, lsl #2
+
+ bl vpx_mbloop_filter_neon
+
+ vst1.u8 {d0}, [r2@64], r1 ; store op2
+ vst1.u8 {d1}, [r3@64], r1 ; store op1
+ vst1.u8 {d2}, [r2@64], r1 ; store op0
+ vst1.u8 {d3}, [r3@64], r1 ; store oq0
+ vst1.u8 {d4}, [r2@64], r1 ; store oq1
+ vst1.u8 {d5}, [r3@64], r1 ; store oq2
+
+ pop {r4-r5, pc}
+
+ ENDP ; |vpx_lpf_horizontal_8_neon|
+
+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+; int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+ push {r0-r1, lr}
+ ldr lr, [sp, #12]
+ push {lr} ; thresh0
+ bl vpx_lpf_horizontal_8_neon
+
+ ldr r2, [sp, #20] ; blimit1
+ ldr r3, [sp, #24] ; limit1
+ ldr lr, [sp, #28]
+ str lr, [sp, #16] ; thresh1
+ add sp, #4
+ pop {r0-r1, lr}
+ add r0, #8 ; s + 8
+ b vpx_lpf_horizontal_8_neon
+ ENDP ; |vpx_lpf_horizontal_8_dual_neon|
+
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_vertical_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #12] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+    ;transpose to 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ sub r2, r0, #3
+ add r3, r0, #1
+
+ bl vpx_mbloop_filter_neon
+
+ ;store op2, op1, op0, oq0
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+ ;store oq1, oq2
+ vst2.8 {d4[0], d5[0]}, [r3], r1
+ vst2.8 {d4[1], d5[1]}, [r3], r1
+ vst2.8 {d4[2], d5[2]}, [r3], r1
+ vst2.8 {d4[3], d5[3]}, [r3], r1
+ vst2.8 {d4[4], d5[4]}, [r3], r1
+ vst2.8 {d4[5], d5[5]}, [r3], r1
+ vst2.8 {d4[6], d5[6]}, [r3], r1
+ vst2.8 {d4[7], d5[7]}, [r3]
+
+ pop {r4-r5, pc}
+ ENDP ; |vpx_lpf_vertical_8_neon|
+
+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int pitch
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+ push {r0-r1, lr}
+ ldr lr, [sp, #12]
+ push {lr} ; thresh0
+ bl vpx_lpf_vertical_8_neon
+
+ ldr r2, [sp, #20] ; blimit1
+ ldr r3, [sp, #24] ; limit1
+ ldr lr, [sp, #28]
+ str lr, [sp, #16] ; thresh1
+ add sp, #4
+ pop {r0-r1, lr}
+ add r0, r0, r1, lsl #3 ; s + 8 * pitch
+ b vpx_lpf_vertical_8_neon
+ ENDP ; |vpx_lpf_vertical_8_dual_neon|
+
+; void vpx_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d0 op2
+; d1 op1
+; d2 op0
+; d3 oq0
+; d4 oq1
+; d5 oq2
+|vpx_mbloop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2)
+
+ vmax.u8 d23, d23, d24 ; m3 = max(m5, m6)
+
+ vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2)
+
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0)
+ vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0)
+ vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d1, d19
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; m4 = max(m7, m8)
+ vmax.u8 d26, d27, d28 ; m5 = max(m10, m11)
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+
+ vmax.u8 d25, d25, d26 ; m4 = max(m4, m5)
+
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmax.u8 d20, d20, d25 ; m2 = max(m2, m4)
+
+ vmov.u8 d23, #1
+ vcge.u8 d24, d0, d24 ; a > blimit
+
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+
+ vcge.u8 d20, d23, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+
+ vand d20, d20, d19 ; flat & mask
+
+ vmov.u8 d22, #0x80
+
+ vorr d23, d21, d23 ; hev
+
+ ; This instruction will truncate the "flat & mask" masks down to 4 bits
+ ; each to fit into one 32 bit arm register. The values are stored in
+ ; q10.64[0].
+ vshrn.u16 d30, q10, #4
+ vmov.u32 r4, d30[0] ; flat & mask 4bits
+
+ adds r5, r4, #1 ; Check for all 1's
+
+ ; If mask and flat are 1's for all vectors, then we only need to execute
+ ; the power branch for all vectors.
+ beq power_branch_only
+
+ cmp r4, #0 ; Check for 0, set flag for later
+
+ ; mbfilter() function
+ ; filter() function
+ ; convert to signed
+ veor d21, d7, d22 ; qs0
+ veor d24, d6, d22 ; ps0
+ veor d25, d5, d22 ; ps1
+ veor d26, d16, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d21, d24 ; ( qs0 - ps0)
+
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+
+ vand d29, d29, d23 ; filter &= hev
+
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d23 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ ; If mask and flat are 0's for all vectors, then we only need to execute
+ ; the filter branch for all vectors.
+ beq filter_branch_only
+
+ ; If mask and flat are mixed then we must perform both branches and
+ ; combine the data.
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d21, d21, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ ; At this point we have already executed the filter branch. The filter
+ ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+ ; branch and combine the data.
+ vmov.u8 d23, #2
+ vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3
+ vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2
+
+ vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask)
+
+ vaddw.u8 q14, d5 ; r_op2 += p1
+
+ vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
+
+ vqrshrn.u16 d30, q14, #3 ; r_op2
+
+ vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3
+ vsubw.u8 q14, d4 ; r_op1 -= p2
+ vaddw.u8 q14, d5 ; r_op1 += p1
+ vaddw.u8 q14, d16 ; r_op1 += q1
+
+ vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
+
+ vqrshrn.u16 d31, q14, #3 ; r_op1
+
+ vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3
+ vsubw.u8 q14, d5 ; r_op0 -= p1
+ vaddw.u8 q14, d6 ; r_op0 += p0
+ vaddw.u8 q14, d17 ; r_op0 += q2
+
+ vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask)
+
+ vqrshrn.u16 d23, q14, #3 ; r_op0
+
+ vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3
+ vsubw.u8 q14, d6 ; r_oq0 -= p0
+ vaddw.u8 q14, d7 ; r_oq0 += q0
+
+ vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask)
+
+ vaddw.u8 q14, d18 ; oq0 += q3
+
+ vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask)
+
+ vqrshrn.u16 d22, q14, #3 ; r_oq0
+
+ vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2
+ vsubw.u8 q14, d7 ; r_oq1 -= q0
+ vaddw.u8 q14, d16 ; r_oq1 += q1
+
+ vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask)
+
+ vaddw.u8 q14, d18 ; r_oq1 += q3
+
+ vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
+
+ vqrshrn.u16 d6, q14, #3 ; r_oq1
+
+ vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1
+ vsubw.u8 q14, d16 ; r_oq2 -= q1
+ vaddw.u8 q14, d17 ; r_oq2 += q2
+ vaddw.u8 q14, d18 ; r_oq2 += q3
+
+ vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask)
+
+ vqrshrn.u16 d7, q14, #3 ; r_oq2
+
+ vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
+ vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask)
+ vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask)
+
+ bx lr
+
+power_branch_only
+ vmov.u8 d27, #3
+ vmov.u8 d21, #2
+ vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
+ vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
+ vaddw.u8 q14, d5 ; op2 += p1
+ vqrshrn.u16 d0, q14, #3 ; op2
+
+ vsubw.u8 q14, d3 ; op1 = op2 - p3
+ vsubw.u8 q14, d4 ; op1 -= p2
+ vaddw.u8 q14, d5 ; op1 += p1
+ vaddw.u8 q14, d16 ; op1 += q1
+ vqrshrn.u16 d1, q14, #3 ; op1
+
+ vsubw.u8 q14, d3 ; op0 = op1 - p3
+ vsubw.u8 q14, d5 ; op0 -= p1
+ vaddw.u8 q14, d6 ; op0 += p0
+ vaddw.u8 q14, d17 ; op0 += q2
+ vqrshrn.u16 d2, q14, #3 ; op0
+
+ vsubw.u8 q14, d3 ; oq0 = op0 - p3
+ vsubw.u8 q14, d6 ; oq0 -= p0
+ vaddw.u8 q14, d7 ; oq0 += q0
+ vaddw.u8 q14, d18 ; oq0 += q3
+ vqrshrn.u16 d3, q14, #3 ; oq0
+
+ vsubw.u8 q14, d4 ; oq1 = oq0 - p2
+ vsubw.u8 q14, d7 ; oq1 -= q0
+ vaddw.u8 q14, d16 ; oq1 += q1
+ vaddw.u8 q14, d18 ; oq1 += q3
+ vqrshrn.u16 d4, q14, #3 ; oq1
+
+ vsubw.u8 q14, d5 ; oq2 = oq1 - p1
+ vsubw.u8 q14, d16 ; oq2 -= q1
+ vaddw.u8 q14, d17 ; oq2 += q2
+ vaddw.u8 q14, d18 ; oq2 += q3
+ vqrshrn.u16 d5, q14, #3 ; oq2
+
+ bx lr
+
+filter_branch_only
+    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
+    ; do the 2 vswp instructions.
+ vswp d0, d4 ; op2
+ vswp d5, d17 ; oq2
+ veor d2, d24, d22 ; *op0 = u^0x80
+ veor d3, d21, d22 ; *oq0 = u^0x80
+ veor d1, d25, d22 ; *op1 = u^0x80
+ veor d4, d26, d22 ; *oq1 = u^0x80
+
+ bx lr
+
+ ENDP ; |vpx_mbloop_filter_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..c54e588239
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// For all the static inline functions, the functions ending with '_8' process
+// 8 samples at a time, and the functions ending with '_16' process 16 samples
+// at a time.
+
+#define FUN_LOAD_THRESH(w, r) \
+ static INLINE void load_thresh_##w( \
+ const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
+ uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \
+ uint8x##w##_t *thresh_vec) { \
+ *blimit_vec = vld1##r##dup_u8(blimit); \
+ *limit_vec = vld1##r##dup_u8(limit); \
+ *thresh_vec = vld1##r##dup_u8(thresh); \
+ }
+
+FUN_LOAD_THRESH(8, _) // load_thresh_8
+FUN_LOAD_THRESH(16, q_) // load_thresh_16
+#undef FUN_LOAD_THRESH
+
+static INLINE void load_thresh_8_dual(
+ const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
+ uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
+ *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
+ *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
+ *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
+}
+
+// Here flat is 64 bits long, with each 8-bit (or 4-bit) chunk being the mask
+// of a pixel. When used to control filter branches, we only need to detect
+// whether it is all 0s or all 1s. We pairwise add the two 32-bit halves of
+// flat into a 32-bit number flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -2. (This holds
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
+ return vget_lane_u32(
+ vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
+}
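+
+// For example, when all 8 mask bytes are 0xFF, flat viewed as two uint32 lanes
+// is { 0xFFFFFFFF, 0xFFFFFFFF }; vpaddl_u32() sums them to 0x1FFFFFFFE and the
+// returned low 32 bits are 0xFFFFFFFE, i.e. (uint32_t)-2. A scalar sketch of
+// the same reduction, assuming the 8 mask bytes are packed into a uint64_t:
+//   static uint32_t calc_flat_status_scalar(uint64_t flat) {
+//     return (uint32_t)flat + (uint32_t)(flat >> 32);  // sum of the 2 halves
+//   }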
+
+// Here flat is 128 bits long, with each 8-bit chunk being the mask of a pixel.
+// When used to control filter branches, we only need to detect whether it is
+// all 0s or all 1s. We apply an arithmetic narrowing shift right by 4 to each
+// 16-bit chunk, giving a 64-bit number in which each 4-bit chunk is the mask
+// of a pixel. Then we pairwise add the two 32-bit halves into a 32-bit number
+// flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -2. (This holds
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
+ const uint8x8_t flat_4bit =
+ vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
+ return calc_flat_status_8(flat_4bit);
+}
+
+#define FUN_FILTER_HEV_MASK4(w, r) \
+ static INLINE uint8x##w##_t filter_hev_mask4_##w( \
+ const uint8x##w##_t limit, const uint8x##w##_t blimit, \
+ const uint8x##w##_t thresh, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \
+ uint8x##w##_t max, t0, t1; \
+ \
+ max = vabd##r##u8(p1, p0); \
+ max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \
+ *hev = vcgt##r##u8(max, thresh); \
+ *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \
+ t0 = vabd##r##u8(p0, q0); \
+ t1 = vabd##r##u8(p1, q1); \
+ t0 = vqadd##r##u8(t0, t0); \
+ t1 = vshr##r##n_u8(t1, 1); \
+ t0 = vqadd##r##u8(t0, t1); \
+ *mask = vcle##r##u8(*mask, limit); \
+ t0 = vcle##r##u8(t0, blimit); \
+ *mask = vand##r##u8(*mask, t0); \
+ \
+ return max; \
+ }
+
+FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8
+FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16
+#undef FUN_FILTER_HEV_MASK4
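+
+// Per pixel, the mask computed by filter_hev_mask4_8/_16 is equivalent
+// (ignoring the saturating adds) to
+//   max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit &&
+//   2 * |p0-q0| + |p1-q1| / 2 <= blimit
+// and hev is set when |p1-p0| > thresh or |q1-q0| > thresh.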
+
+#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
+ static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
+ const uint8x##w##_t limit, const uint8x##w##_t blimit, \
+ const uint8x##w##_t thresh, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
+ uint8x##w##_t *hev) { \
+ uint8x##w##_t max, mask; \
+ \
+ max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
+ q2, q3, hev, &mask); \
+ *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \
+ *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \
+ *flat = vand##r##u8(*flat, mask); \
+ *flat_status = calc_flat_status_##w(*flat); \
+ \
+ return mask; \
+ }
+
+FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8
+FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16
+#undef FUN_FILTER_FLAT_HEV_MASK
+
+#define FUN_FLAT_MASK5(w, r) \
+ static INLINE uint8x##w##_t flat_mask5_##w( \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t flat, \
+ uint32_t *flat2_status) { \
+ uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \
+ flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \
+ flat2 = vand##r##u8(flat2, flat); \
+ *flat2_status = calc_flat_status_##w(flat2); \
+ \
+ return flat2; \
+ }
+
+FUN_FLAT_MASK5(8, _) // flat_mask5_8
+FUN_FLAT_MASK5(16, q_) // flat_mask5_16
+#undef FUN_FLAT_MASK5
+
+#define FUN_FLIP_SIGN(w, r) \
+ static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \
+ const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \
+ return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \
+ }
+
+FUN_FLIP_SIGN(8, _) // flip_sign_8
+FUN_FLIP_SIGN(16, q_) // flip_sign_16
+#undef FUN_FLIP_SIGN
+
+#define FUN_FLIP_SIGN_BACK(w, r) \
+ static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
+ const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \
+ return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \
+ }
+
+FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8
+FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16
+#undef FUN_FLIP_SIGN_BACK
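+
+// flip_sign_8/_16 map the unsigned pixel range [0, 255] onto the signed range
+// [-128, 127] by XORing with 0x80 (0x00 -> -128, 0x80 -> 0, 0xFF -> 127), so
+// the filter arithmetic below can use signed saturating ops; flip_sign_back_*
+// applies the same XOR to undo the mapping after filtering.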
+
+static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
+ const uint8x8_t add0, const uint8x8_t add1,
+ uint16x8_t *sum) {
+ *sum = vsubw_u8(*sum, sub0);
+ *sum = vsubw_u8(*sum, sub1);
+ *sum = vaddw_u8(*sum, add0);
+ *sum = vaddw_u8(*sum, add1);
+}
+
+static INLINE void filter_update_16(const uint8x16_t sub0,
+ const uint8x16_t sub1,
+ const uint8x16_t add0,
+ const uint8x16_t add1, uint16x8_t *sum0,
+ uint16x8_t *sum1) {
+ *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
+ *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
+ *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
+ *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
+ *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
+ *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
+ *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
+ *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
+}
+
+static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
+ const uint8x8_t sub1,
+ const uint8x8_t add0,
+ const uint8x8_t add1,
+ uint16x8_t *sum) {
+ filter_update_8(sub0, sub1, add0, add1, sum);
+ return vrshrn_n_u16(*sum, 3);
+}
+
+static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
+ const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
+ const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
+ filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
+ return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
+}
+
+static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
+ const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
+ const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
+ uint16x8_t *sum) {
+ filter_update_8(sub0, sub1, add0, add1, sum);
+ return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
+}
+
+static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
+ const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
+ const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
+ uint16x8_t *sum0, uint16x8_t *sum1) {
+ uint8x16_t t;
+ filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
+ t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
+ return vbslq_u8(flat, t, in);
+}
+
+// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
+ const uint8x8_t p1, const uint8x8_t p0,
+ const uint8x8_t q0, const uint8x8_t q1,
+ const uint8x8_t q2, const uint8x8_t q3,
+ uint8x8_t *op2, uint8x8_t *op1,
+ uint8x8_t *op0, uint8x8_t *oq0,
+ uint8x8_t *oq1, uint8x8_t *oq2) {
+ uint16x8_t sum;
+ sum = vaddl_u8(p3, p3); // 2*p3
+ sum = vaddw_u8(sum, p3); // 3*p3
+ sum = vaddw_u8(sum, p2); // 3*p3+p2
+ sum = vaddw_u8(sum, p2); // 3*p3+2*p2
+ sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1
+ sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0
+ sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vrshrn_n_u16(sum, 3);
+ *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
+ *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
+ *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
+ *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
+ *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
+}
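+
+// Each output above is a rounded, 3-bit-shifted weighted sum, e.g.
+//   *op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+// The kernel calls reuse the running sum, subtracting the two samples that
+// leave the window and adding the two that enter it instead of recomputing
+// the whole sum for every output.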
+
+static INLINE void calc_7_tap_filter_16(
+ const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
+ const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
+ uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
+ uint16x8_t sum0, sum1;
+ sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3
+ sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3
+ sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1
+ sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1
+ sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0
+ sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0
+ sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0
+ sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
+ *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
+ *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
+ *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
+ *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
+ *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
+}
+
+#define FUN_APPLY_7_TAP_FILTER(w, r) \
+ static INLINE void apply_7_tap_filter_##w( \
+ const uint8x##w##_t flat, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \
+ uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \
+ uint8x##w##_t *oq2) { \
+ uint8x##w##_t tp1, tp0, tq0, tq1; \
+ calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \
+ &tq0, &tq1, oq2); \
+ *op2 = vbsl##r##u8(flat, *op2, p2); \
+ *op1 = vbsl##r##u8(flat, tp1, *op1); \
+ *op0 = vbsl##r##u8(flat, tp0, *op0); \
+ *oq0 = vbsl##r##u8(flat, tq0, *oq0); \
+ *oq1 = vbsl##r##u8(flat, tq1, *oq1); \
+ *oq2 = vbsl##r##u8(flat, *oq2, q2); \
+ }
+
+FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8
+FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16
+#undef FUN_APPLY_7_TAP_FILTER
+
+// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+static INLINE void apply_15_tap_filter_8(
+ const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
+ const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
+ const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
+ const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
+ const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
+ const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
+ uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
+ uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
+ uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
+ uint16x8_t sum;
+ sum = vshll_n_u8(p7, 3); // 8*p7
+ sum = vsubw_u8(sum, p7); // 7*p7
+ sum = vaddw_u8(sum, p6); // 7*p7+p6
+ sum = vaddw_u8(sum, p6); // 7*p7+2*p6
+ sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5
+ sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4
+ sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3
+ sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
+ sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
+ *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
+ *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
+ *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
+ *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
+ *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
+ *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
+ *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
+ *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
+ *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
+ *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
+ *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
+ *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
+ *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
+}
+
+static INLINE void apply_15_tap_filter_16(
+ const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
+ const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
+ const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+ const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
+ const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
+ uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
+ uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
+ uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
+ uint16x8_t sum0, sum1;
+ uint8x16_t t;
+ sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7
+ sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7
+ sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7
+ sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7
+ sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6
+ sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6
+ sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6
+ sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6
+ sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5
+ sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 7*p7+2*p6+p5
+ sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4
+ sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4
+ sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3
+ sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
+ *op6 = vbslq_u8(flat2, t, p6);
+ *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
+ *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
+ *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
+ *op2 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
+ *op1 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
+ *op0 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
+ *oq0 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
+ *oq1 =
+ apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
+ *oq2 =
+ apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
+ *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
+ *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
+ *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
+ *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
+}
+
+#define FUN_FILTER4(w, r) \
+ static INLINE void filter4_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t hev, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \
+ uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \
+ int8x##w##_t filter, filter1, filter2, t; \
+ int8x##w##_t ps1 = flip_sign_##w(p1); \
+ int8x##w##_t ps0 = flip_sign_##w(p0); \
+ int8x##w##_t qs0 = flip_sign_##w(q0); \
+ int8x##w##_t qs1 = flip_sign_##w(q1); \
+ \
+ /* add outer taps if we have high edge variance */ \
+ filter = vqsub##r##s8(ps1, qs1); \
+ filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \
+ t = vqsub##r##s8(qs0, ps0); \
+ \
+ /* inner taps */ \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \
+ \
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */ \
+ /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
+ /* we'd round it by 3 the other way */ \
+ filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \
+ filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \
+ \
+ qs0 = vqsub##r##s8(qs0, filter1); \
+ ps0 = vqadd##r##s8(ps0, filter2); \
+ *oq0 = flip_sign_back_##w(qs0); \
+ *op0 = flip_sign_back_##w(ps0); \
+ \
+ /* outer tap adjustments */ \
+ filter = vrshr##r##n_s8(filter1, 1); \
+ filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev)); \
+ \
+ qs1 = vqsub##r##s8(qs1, filter); \
+ ps1 = vqadd##r##s8(ps1, filter); \
+ *oq1 = flip_sign_back_##w(qs1); \
+ *op1 = flip_sign_back_##w(ps1); \
+ }
+
+FUN_FILTER4(8, _) // filter4_8
+FUN_FILTER4(16, q_) // filter4_16
+#undef FUN_FILTER4
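+
+// The +4/+3 rounding in filter4_* makes filter1 and filter2 differ by one
+// exactly when the clamped filter value is congruent to 4 mod 8 (ignoring
+// saturation at the int8 limits); e.g. filter = 12 gives
+// filter1 = (12 + 4) >> 3 = 2 and filter2 = (12 + 3) >> 3 = 1, matching the
+// "round one side +4 and the other +3" comment above.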
+
+#define FUN_FILTER8(w) \
+ static INLINE void filter8_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t flat, \
+ const uint32_t flat_status, const uint8x##w##_t hev, \
+ const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
+ const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
+ const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \
+ if (flat_status != (uint32_t)-2) { \
+ filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
+ *op2 = p2; \
+ *oq2 = q2; \
+ if (flat_status) { \
+ apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+ op0, oq0, oq1, oq2); \
+ } \
+ } else { \
+ calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \
+ oq0, oq1, oq2); \
+ } \
+ }
+
+FUN_FILTER8(8) // filter8_8
+FUN_FILTER8(16) // filter8_16
+#undef FUN_FILTER8
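+
+// filter8_* dispatches on flat_status: 0 means no pixel is flat, so only
+// filter4_* runs and p2/q2 pass through unchanged; (uint32_t)-2 means every
+// pixel is flat, so the 7-tap filter alone is applied; any other value runs
+// both paths and apply_7_tap_filter_* blends them per pixel with vbsl.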
+
+#define FUN_FILTER16(w) \
+ static INLINE void filter16_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t flat, \
+ const uint32_t flat_status, const uint8x##w##_t flat2, \
+ const uint32_t flat2_status, const uint8x##w##_t hev, \
+ const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \
+ uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \
+ uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \
+ if (flat_status != (uint32_t)-2) { \
+ filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
+ } \
+ \
+ if (flat_status) { \
+ *op2 = p2; \
+ *oq2 = q2; \
+ if (flat2_status != (uint32_t)-2) { \
+ apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+ op0, oq0, oq1, oq2); \
+ } \
+ if (flat2_status) { \
+ apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
+ q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \
+ op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \
+ oq6); \
+ } \
+ } \
+ }
+
+FUN_FILTER16(8) // filter16_8
+FUN_FILTER16(16) // filter16_16
+#undef FUN_FILTER16
+
+#define FUN_LOAD8(w, r) \
+ static INLINE void load_##w##x8( \
+ const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \
+ uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \
+ uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \
+ *p3 = vld1##r##u8(s); \
+ s += p; \
+ *p2 = vld1##r##u8(s); \
+ s += p; \
+ *p1 = vld1##r##u8(s); \
+ s += p; \
+ *p0 = vld1##r##u8(s); \
+ s += p; \
+ *q0 = vld1##r##u8(s); \
+ s += p; \
+ *q1 = vld1##r##u8(s); \
+ s += p; \
+ *q2 = vld1##r##u8(s); \
+ s += p; \
+ *q3 = vld1##r##u8(s); \
+ }
+
+FUN_LOAD8(8, _) // load_8x8
+FUN_LOAD8(16, q_) // load_16x8
+#undef FUN_LOAD8
+
+#define FUN_LOAD16(w, r) \
+ static INLINE void load_##w##x16( \
+ const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \
+ uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \
+ uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \
+ uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \
+ uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \
+ uint8x##w##_t *s14, uint8x##w##_t *s15) { \
+ *s0 = vld1##r##u8(s); \
+ s += p; \
+ *s1 = vld1##r##u8(s); \
+ s += p; \
+ *s2 = vld1##r##u8(s); \
+ s += p; \
+ *s3 = vld1##r##u8(s); \
+ s += p; \
+ *s4 = vld1##r##u8(s); \
+ s += p; \
+ *s5 = vld1##r##u8(s); \
+ s += p; \
+ *s6 = vld1##r##u8(s); \
+ s += p; \
+ *s7 = vld1##r##u8(s); \
+ s += p; \
+ *s8 = vld1##r##u8(s); \
+ s += p; \
+ *s9 = vld1##r##u8(s); \
+ s += p; \
+ *s10 = vld1##r##u8(s); \
+ s += p; \
+ *s11 = vld1##r##u8(s); \
+ s += p; \
+ *s12 = vld1##r##u8(s); \
+ s += p; \
+ *s13 = vld1##r##u8(s); \
+ s += p; \
+ *s14 = vld1##r##u8(s); \
+ s += p; \
+ *s15 = vld1##r##u8(s); \
+ }
+
+FUN_LOAD16(8, _) // load_8x16
+FUN_LOAD16(16, q_) // load_16x16
+#undef FUN_LOAD16
+
+#define FUN_STORE4(w, r) \
+ static INLINE void store_##w##x4( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ }
+
+FUN_STORE4(8, _) // store_8x4
+FUN_STORE4(16, q_) // store_16x4
+#undef FUN_STORE4
+
+#define FUN_STORE6(w, r) \
+ static INLINE void store_##w##x6( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ }
+
+FUN_STORE6(8, _) // store_8x6
+FUN_STORE6(16, q_) // store_16x6
+#undef FUN_STORE6
+
+static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
+ const uint8x8_t p0, const uint8x8_t q0,
+ const uint8x8_t q1) {
+ uint8x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
+ vst4_lane_u8(s, o, 0);
+ s += p;
+ vst4_lane_u8(s, o, 1);
+ s += p;
+ vst4_lane_u8(s, o, 2);
+ s += p;
+ vst4_lane_u8(s, o, 3);
+ s += p;
+ vst4_lane_u8(s, o, 4);
+ s += p;
+ vst4_lane_u8(s, o, 5);
+ s += p;
+ vst4_lane_u8(s, o, 6);
+ s += p;
+ vst4_lane_u8(s, o, 7);
+}
+
+static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3, const uint8x8_t s4,
+ const uint8x8_t s5) {
+ uint8x8x3_t o0, o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o1.val[0] = s3;
+ o1.val[1] = s4;
+ o1.val[2] = s5;
+ vst3_lane_u8(s - 3, o0, 0);
+ vst3_lane_u8(s + 0, o1, 0);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 1);
+ vst3_lane_u8(s + 0, o1, 1);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 2);
+ vst3_lane_u8(s + 0, o1, 2);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 3);
+ vst3_lane_u8(s + 0, o1, 3);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 4);
+ vst3_lane_u8(s + 0, o1, 4);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 5);
+ vst3_lane_u8(s + 0, o1, 5);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 6);
+ vst3_lane_u8(s + 0, o1, 6);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 7);
+ vst3_lane_u8(s + 0, o1, 7);
+}
+
+#define FUN_STORE8(w, r) \
+ static INLINE void store_##w##x8( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5, const uint8x##w##_t s6, \
+ const uint8x##w##_t s7) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ s += p; \
+ vst1##r##u8(s, s6); \
+ s += p; \
+ vst1##r##u8(s, s7); \
+ }
+
+FUN_STORE8(8, _) // store_8x8
+FUN_STORE8(16, q_) // store_16x8
+#undef FUN_STORE8
+
+#define FUN_STORE14(w, r) \
+ static INLINE void store_##w##x14( \
+ uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint32_t flat_status, const uint32_t flat2_status) { \
+ if (flat_status) { \
+ if (flat2_status) { \
+ vst1##r##u8(s - 7 * p, p6); \
+ vst1##r##u8(s - 6 * p, p5); \
+ vst1##r##u8(s - 5 * p, p4); \
+ vst1##r##u8(s - 4 * p, p3); \
+ vst1##r##u8(s + 3 * p, q3); \
+ vst1##r##u8(s + 4 * p, q4); \
+ vst1##r##u8(s + 5 * p, q5); \
+ vst1##r##u8(s + 6 * p, q6); \
+ } \
+ vst1##r##u8(s - 3 * p, p2); \
+ vst1##r##u8(s + 2 * p, q2); \
+ } \
+ vst1##r##u8(s - 2 * p, p1); \
+ vst1##r##u8(s - 1 * p, p0); \
+ vst1##r##u8(s + 0 * p, q0); \
+ vst1##r##u8(s + 1 * p, q1); \
+ }
+
+FUN_STORE14(8, _) // store_8x14
+FUN_STORE14(16, q_) // store_16x14
+#undef FUN_STORE14
+
+static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
+ const uint8x16_t s1, const uint8x16_t s2,
+ const uint8x16_t s3, const uint8x16_t s4,
+ const uint8x16_t s5, const uint8x16_t s6,
+ const uint8x16_t s7, const uint8x16_t s8,
+ const uint8x16_t s9, const uint8x16_t s10,
+ const uint8x16_t s11, const uint8x16_t s12,
+ const uint8x16_t s13, const uint8x16_t s14,
+ const uint8x16_t s15) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+ s += p;
+ vst1q_u8(s, s8);
+ s += p;
+ vst1q_u8(s, s9);
+ s += p;
+ vst1q_u8(s, s10);
+ s += p;
+ vst1q_u8(s, s11);
+ s += p;
+ vst1q_u8(s, s12);
+ s += p;
+ vst1q_u8(s, s13);
+ s += p;
+ vst1q_u8(s, s14);
+ s += p;
+ vst1q_u8(s, s15);
+}
+
+#define FUN_HOR_4_KERNEL(name, w) \
+ static INLINE void lpf_horizontal_4##name##kernel( \
+ uint8_t *s, const int p, const uint8x##w##_t blimit, \
+ const uint8x##w##_t limit, const uint8x##w##_t thresh) { \
+ uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \
+ \
+ load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \
+ filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
+ q3, &hev, &mask); \
+ filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \
+ store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \
+ }
+
+FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel
+FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel
+#undef FUN_HOR_4_KERNEL
+
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ s15;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+ &s11, &s12, &s13, &s14, &s15);
+ transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+ s -= 2;
+ store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
+ vget_low_u8(q1));
+ store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
+ vget_high_u8(q1));
+}
+
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ // Note: transpose + store_8x8() is faster than store_6x8().
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+ store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ s15;
+ uint32_t flat_status;
+
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+ &s11, &s12, &s13, &s14, &s15);
+ transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+ p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+ filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2);
+ // Note: store_6x8() twice is faster than transpose + store_8x16().
+ store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+ vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+ store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+ vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+ vget_high_u8(oq2));
+}
+
+#define FUN_LPF_16_KERNEL(name, w) \
+ static INLINE void lpf_16##name##kernel( \
+ const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
+ const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \
+ uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \
+ uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \
+ uint32_t *flat_status, uint32_t *flat2_status) { \
+ uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \
+ \
+ load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \
+ &thresh_vec); \
+ mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \
+ p1, p0, q0, q1, q2, q3, &flat, \
+ flat_status, &hev); \
+ flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \
+ flat2_status); \
+ filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \
+ p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \
+ op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \
+ oq6); \
+ }
+
+FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel
+FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel
+#undef FUN_LPF_16_KERNEL
+
+// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Short of reworking the code or adding
+// an extra branch, this warning cannot be silenced any other way. The
+// loopfilter is only called when needed for a block, so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
+ op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
+ &q3, &q4, &q5, &q6, &q7);
+ lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
+ q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
+ &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
+ &flat2_status);
+ store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+ oq5, oq6, flat_status, flat2_status);
+}
+
+void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
+ op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ p7 = vld1q_u8(s - 8 * p);
+ p6 = vld1q_u8(s - 7 * p);
+ p5 = vld1q_u8(s - 6 * p);
+ p4 = vld1q_u8(s - 5 * p);
+ q4 = vld1q_u8(s + 4 * p);
+ q5 = vld1q_u8(s + 5 * p);
+ q6 = vld1q_u8(s + 6 * p);
+ q7 = vld1q_u8(s + 7 * p);
+ lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
+ q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ &flat_status, &flat2_status);
+ store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+ oq5, oq6, flat_status, flat2_status);
+}
+
+void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
+ op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint32_t flat_status, flat2_status;
+
+ s -= 8;
+ load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3,
+ &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
+ q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
+ &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
+ &flat2_status);
+ if (flat_status) {
+ if (flat2_status) {
+ transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
+ oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
+ &s6, &s7);
+ store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
+ } else {
+ // Note: transpose + store_8x8() is faster than store_6x8().
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+ store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+ }
+ } else {
+ store_4x8(s + 6, p, op1, op0, oq0, oq1);
+ }
+}
+
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
+ op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ s15;
+ uint32_t flat_status, flat2_status;
+
+ s -= 8;
+ load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14, &s15);
+ transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
+ &q2, &q3, &q4, &q5, &q6, &q7);
+ lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
+ q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ &flat_status, &flat2_status);
+ if (flat_status) {
+ if (flat2_status) {
+ transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
+ oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
+ &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14,
+ &s15);
+ store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, s15);
+ } else {
+ // Note: store_6x8() twice is faster than transpose + store_8x16().
+ s += 8;
+ store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+ vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+ store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+ vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+ vget_high_u8(oq2));
+ }
+ } else {
+ s += 6;
+ store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0),
+ vget_low_u8(oq1));
+ store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0),
+ vget_high_u8(oq0), vget_high_u8(oq1));
+ }
+}
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..1a20da70ef
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Support for these xN intrinsics is lacking in older versions of GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 8 || defined(__arm__)
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+#endif
+
+#if __GNUC__ < 9 || defined(__arm__)
+static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+#endif
+#endif
+
+static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
+ const int16_t c2, const int16_t c3) {
+ return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
+ ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48));
+}
+
+static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
+ return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32));
+}
+
+static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
+ const int32_t c2, const int32_t c3) {
+ return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
+}
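+
+// For example, create_s16x4_neon(1, 2, 3, 4) builds the vector { 1, 2, 3, 4 }
+// with c0 in lane 0 (the low 16 bits of the packed 64-bit value); the
+// intermediate casts keep negative constants from sign-extending into the
+// neighbouring 16-bit slots.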
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4x2_t v0 = vld2q_s32(buf);
+ const int32x4x2_t v1 = vld2q_s32(buf + 8);
+ const int16x4_t s0 = vmovn_s32(v0.val[0]);
+ const int16x4_t s1 = vmovn_s32(v0.val[1]);
+ const int16x4_t s2 = vmovn_s32(v1.val[0]);
+ const int16x4_t s3 = vmovn_s32(v1.val[1]);
+ int16x8x2_t res;
+ res.val[0] = vcombine_s16(s0, s2);
+ res.val[1] = vcombine_s16(s1, s3);
+ return res;
+#else
+ return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+#else
+ return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ return vmovn_s32(v0);
+#else
+ return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+#else
+ vst1q_s16(buf, a);
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
+ vst1q_s32(buf, a);
+}
+
+static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
+ return vld1q_s32(buf);
+}
+#endif
+
+// Use memcpy() rather than a pointer cast to store 4 bytes. A cast to
+// uint32_t * would let the compiler assume the 4-byte alignment required of
+// uint32_t and add alignment hints to the memory access.
+//
+// This is used by functions operating on uint8_t which wish to load or store 4
+// values at a time but which may not be on 4-byte boundaries.
+static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
+ memcpy(buf, &a, 4);
+}
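+
+// For example, store_unaligned_u8() below uses it as
+//   uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+// which typically compiles down to a single 4-byte store with no alignment
+// assumption.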
+
+// Load 4 contiguous bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 4 contiguous bytes and replicate across a vector when alignment is not
+// guaranteed.
+static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
+ uint32_t a;
+ memcpy(&a, buf, 4);
+ return vreinterpret_u8_u32(vdup_n_u32(a));
+}
+
+// Store 4 contiguous bytes from the low half of an 8x8 vector.
+static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
+ vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
+}
+
+// Store 4 contiguous bytes from the high half of an 8x8 vector.
+static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
+ vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
+ ptrdiff_t stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+ if (stride == 4) return vld1_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 8 bytes when alignment is not guaranteed.
+static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
+// Load 2 sets of 8 bytes when alignment is not guaranteed.
+static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
+ ptrdiff_t stride) {
+ uint64_t a;
+ uint64x2_t a_u64;
+ if (stride == 4) return vld1q_u16(buf);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vdupq_n_u64(a);
+ memcpy(&a, buf, 8);
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ return vreinterpretq_u16_u64(a_u64);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
+ const uint8x8_t a) {
+ const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+ if (stride == 4) {
+ vst1_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
+ ptrdiff_t stride) {
+ uint32_t a;
+ uint32x4_t a_u32;
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdupq_n_u32(a);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
+// Store 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
+ const uint8x16_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+ if (stride == 4) {
+ vst1q_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
+ uint32x2_t a = vdup_n_u32(0);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+ buf += stride;
+ a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+ return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
+ uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+ buf += stride;
+ vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t s4, const uint8x8_t s5,
+ const uint8x8_t s6, const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3,
+ const uint8x16_t s4, const uint8x16_t s5,
+ const uint8x16_t s6, const uint8x16_t s7) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+}
+
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+ uint16x8_t *s6, uint16x8_t *s7) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+}
+
+#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
new file mode 100644
index 0000000000..5a76065549
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t dqcoeff_0 =
+ vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ const int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
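+
+// Worked example of the identities above: with rounded = 100 and
+// quant = 0x4000 (0.5 in Q15), vqdmulhq_s16 yields (100 * 0x4000 * 2) >> 16
+// = 50, and the extra >> 1 gives 25 == (100 * 0x4000) >> 16, i.e. the
+// doubling done by the multiply-high is undone by the final shift.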
+
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vld1q_s16(zbin_ptr);
+ int16x8_t round = vld1q_s16(round_ptr);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+ quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
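+ // eob_max now holds, per lane, the largest iscan entry seen among the
+ // non-zero quantized coefficients; reduce it horizontally to produce
+ // *eob_ptr.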
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+ // Need this here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)scan;
+}
+
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff_ptr) {
+ int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff_ptr,
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
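+
+// Worked example of the rounding fix above: for a product of -7 the C code
+// computes -7 / 2 = -3 (truncation towards zero), whereas an arithmetic
+// shift alone gives -7 >> 1 = -4; adding the extracted sign bit first gives
+// (-7 + 1) >> 1 = -3, matching the C behaviour.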
+
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
+
+// Main difference is that zbin values are halved before comparison and dqcoeff
+// values are divided by 2. zbin is rounded but dqcoeff is not.
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+ const int16_t *iscan = scan_order->iscan;
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+ int16x8_t quant = vld1q_s16(mb_plane->quant);
+ int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 0000000000..3a548d0f9f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1, s2, s3;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1, s2, s3;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x8_t abs_diff = vabd_u8(src, ref);
+ *sad_sum = vaddw_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x8_t s = vld1_u8(src + i * src_stride);
+ sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+ sad8_neon(s, r3, &sum[3]);
+
+ i += 2;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+#define SAD_WXH_4D_NEON(w, h) \
+ void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \
+ (h)); \
+ }
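+
+// For example, SAD_WXH_4D_NEON(16, 16) defines vpx_sad16x16x4d_neon(), which
+// computes the four SADs of one 16x16 source block against four reference
+// blocks by forwarding to sad16xhx4d_neon() with h = 16.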
+
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+
+#undef SAD_WXH_4D_NEON
+
+#define SAD_SKIP_WXH_4D_NEON(w, h) \
+ void vpx_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], int ref_stride, \
+ uint32_t sad_array[4]) { \
+ sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
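+
+// The skip variants sample every other row (doubling both strides and
+// halving h) and then double each SAD, approximating the full-height result
+// at roughly half the cost.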
+
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
+
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..566a1f81db
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
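+
+// Note: vdotq_u32 with a vector of all-ones accumulates each group of four
+// absolute differences straight into a 32-bit lane, so no intermediate
+// widening is needed and the accumulators cannot overflow for any supported
+// block size.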
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32;
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint8x16_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t diff0 = vabdq_u8(s0, r0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t diff1 = vabdq_u8(s1, r1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
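+
+// load_unaligned_u8 packs two 4-byte rows into a single 8-lane vector, so
+// each iteration above handles two rows; this is why the loop counter is
+// h / 2 and the pointers advance by two strides.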
+
+#define SAD_WXH_NEON(w, h) \
+ unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int vpx_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_NEON
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32;
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--i != 0);
+
+ sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+ uint8x16_t diff0 = vabdq_u8(s0, avg0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+ uint8x16_t diff1 = vabdq_u8(s1, avg1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(r, p);
+ uint8x16_t diff = vabdq_u8(s, avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h) \
+ uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm
new file mode 100644
index 0000000000..9811cd5a5a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_push_neon|
+ EXPORT |vpx_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_push_neon| PROC
+ vstm r0!, {d8-d15}
+ bx lr
+
+ ENDP
+
+|vpx_pop_neon| PROC
+ vldm r0!, {d8-d15}
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..9328c3ed89
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
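+
+// The bilinear kernel above computes (s0 * (8 - offset) + s1 * offset + 4)
+// >> 3 per pixel: vmull/vmlal form the weighted sum in 16 bits and
+// vrshrn_n_u16(..., 3) applies the rounding shift back down to 8 bits.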
+
+// Process a block exactly 8 wide and any height.
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
+}
+
+// Process a block which is a multiple of 16 wide and any height.
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+ uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+ vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
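+
+// For example, vpx_sub_pixel_variance8x8_neon() filters 8x9 source rows
+// horizontally into tmp0, filters tmp0 vertically into the 8x8 tmp1, and
+// then measures the variance of tmp1 against the reference block.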
+
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
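+
+// The specialized variant skips the general bilinear filter whenever an
+// offset is 0 (no filtering needed in that direction) or 4 (exactly halfway,
+// so a simple pairwise average via var_filter_block2d_avg suffices), and it
+// only runs the passes that are actually required.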
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
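+
+// vrhaddq_u8 computes (a + b + 1) >> 1 per lane, the same rounding as the C
+// vpx_comp_avg_pred(), so these helpers can fold the averaging step into the
+// filtering pass without changing the result.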
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..2c008e48ab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r = rows, c;
+
+ if (cols > 16) {
+ do {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
+ const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
+ const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
+ const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols > 8) {
+ do {
+ const uint8x16_t s = vld1q_u8(&src[0]);
+ const uint8x16_t p = vld1q_u8(&pred[0]);
+ const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
+ const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols > 4) {
+ do {
+ const uint8x8_t s = vld1_u8(&src[0]);
+ const uint8x8_t p = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(s, p);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else {
+ assert(cols == 4);
+ do {
+ const uint8x8_t s = load_unaligned_u8(src, (int)src_stride);
+ const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride);
+ const uint16x8_t d = vsubl_u8(s, p);
+ vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d)));
+ vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d)));
+ diff += 2 * diff_stride;
+ pred += 2 * pred_stride;
+ src += 2 * src_stride;
+ r -= 2;
+ } while (r);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ int r = rows, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+
+ if (cols >= 16) {
+ do {
+ for (c = 0; c < cols; c += 16) {
+ const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t d0 = vsubq_u16(s0, p0);
+ const uint16x8_t d1 = vsubq_u16(s1, p1);
+ vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 8) {
+ do {
+ for (c = 0; c < cols; c += 8) {
+ const uint16x8_t s = vld1q_u16(&src[c]);
+ const uint16x8_t p = vld1q_u16(&pred[c]);
+ const uint16x8_t d0 = vsubq_u16(s, p);
+ vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 4) {
+ do {
+ for (c = 0; c < cols; c += 4) {
+ const uint16x4_t s = vld1_u16(&src[c]);
+ const uint16x4_t p = vld1_u16(&pred[c]);
+ const uint16x4_t v_diff = vsub_u16(s, p);
+ vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
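+
+// Scalar reference for vpx_subtract_block_neon above (an illustrative sketch
+// mirroring the generic C path; the function name is an assumption, not an
+// upstream symbol). Each output element is src - pred widened to 16 bits,
+// which is what the vsubl_u8 paths compute two's-complement-identically.
+static INLINE void example_subtract_block_ref(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride,
+ const uint8_t *pred, ptrdiff_t pred_stride) {
+ int r, c;
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
+ diff += diff_stride;
+ src += src_stride;
+ pred += pred_stride;
+ }
+}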
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..48a2fc05ca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t b = vpaddl_u8(a);
+ const uint16x4_t c = vpadd_u16(b, b);
+ return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t b = vpaddl_u8(a);
+ const uint16x4_t c = vpadd_u16(b, b);
+ const uint16x4_t d = vpadd_u16(c, c);
+ return vget_lane_u16(d, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_u8(a);
+#else
+ const uint16x8_t b = vpaddlq_u8(a);
+ const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
+ const uint16x4_t d = vpadd_u16(c, c);
+ const uint16x4_t e = vpadd_u16(d, d);
+ return vget_lane_u16(e, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddv_u16(a);
+#else
+ const uint16x4_t b = vpadd_u16(a, a);
+ const uint16x4_t c = vpadd_u16(b, b);
+ return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_s16(a);
+#else
+ const int32x4_t b = vpaddlq_s16(a);
+ const int64x2_t c = vpaddlq_s32(b);
+ const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+ vreinterpret_s32_s64(vget_high_s64(c)));
+ return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_u16(a);
+#else
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
+#if VPX_ARCH_AARCH64
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+ const uint16x8_t b0 = vpaddq_u16(a0, a1);
+ return vpaddlq_u16(b0);
+#else
+ const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint16x4_t b0 = vpadd_u16(a0, a1);
+ const uint16x4_t b1 = vpadd_u16(a2, a3);
+ return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
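+
+// Reference sketch (illustrative, unused by the library; the name is an
+// assumption): lane i of the result equals the scalar lane-sum of sum[i].
+// The 4d variants exist so kernels such as the 4-way SAD routines can keep
+// four block sums in a single register.
+static INLINE uint32x4_t example_horizontal_add_4d_u16x8_ref(
+ const uint16x8_t sum[4]) {
+ uint32x4_t res = vdupq_n_u32(0);
+ res = vsetq_lane_u32(horizontal_add_uint16x8(sum[0]), res, 0);
+ res = vsetq_lane_u32(horizontal_add_uint16x8(sum[1]), res, 1);
+ res = vsetq_lane_u32(horizontal_add_uint16x8(sum[2]), res, 2);
+ res = vsetq_lane_u32(horizontal_add_uint16x8(sum[3]), res, 3);
+ return res;
+}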
+
+static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
+ const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+ const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+ const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+ const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+ const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+ const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if VPX_ARCH_AARCH64
+ const uint32x4_t c0 = vpaddq_u32(b0, b1);
+ const uint32x4_t c1 = vpaddq_u32(b2, b3);
+ return vpaddq_u32(c0, c1);
+#else
+ const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+ const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+ const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+ const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+ const uint32x2_t d0 = vpadd_u32(c0, c1);
+ const uint32x2_t d1 = vpadd_u32(c2, c3);
+ return vcombine_u32(d0, d1);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddv_s32(a);
+#else
+ return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddv_u32(a);
+#else
+ return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
+#if VPX_ARCH_AARCH64
+ uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+ uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+ return vpaddq_u32(res01, res23);
+#else
+ uint32x4_t res = vdupq_n_u32(0);
+ res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
+ res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
+ res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
+ res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddvq_s64(a);
+#else
+ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
+#if VPX_ARCH_AARCH64
+ return vaddvq_u64(a);
+#else
+ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
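+// Equivalence sketch (illustrative, unused by the library; the name is an
+// assumption): every horizontal_add_* helper above reduces a vector to the
+// plain scalar sum of its lanes. For example, horizontal_add_uint32x4
+// computes the same value as this lane-by-lane sum.
+static INLINE uint32_t example_horizontal_add_u32x4_ref(const uint32x4_t a) {
+ uint32_t lanes[4];
+ vst1q_u32(lanes, a);
+ return lanes[0] + lanes[1] + lanes[2] + lanes[3];
+}
+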
+#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..074afe3258
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
+ if (size == 4) {
+ int16x4_t s[4];
+ int32x4_t sum_s32;
+
+ s[0] = vld1_s16(src + 0 * stride);
+ s[1] = vld1_s16(src + 1 * stride);
+ s[2] = vld1_s16(src + 2 * stride);
+ s[3] = vld1_s16(src + 3 * stride);
+
+ sum_s32 = vmull_s16(s[0], s[0]);
+ sum_s32 = vmlal_s16(sum_s32, s[1], s[1]);
+ sum_s32 = vmlal_s16(sum_s32, s[2], s[2]);
+ sum_s32 = vmlal_s16(sum_s32, s[3], s[3]);
+
+ return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32));
+ } else {
+ uint64x2_t sum_u64 = vdupq_n_u64(0);
+ int rows = size;
+
+ do {
+ const int16_t *src_ptr = src;
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int cols = size;
+
+ do {
+ int16x8_t s[8];
+
+ s[0] = vld1q_s16(src_ptr + 0 * stride);
+ s[1] = vld1q_s16(src_ptr + 1 * stride);
+ s[2] = vld1q_s16(src_ptr + 2 * stride);
+ s[3] = vld1q_s16(src_ptr + 3 * stride);
+ s[4] = vld1q_s16(src_ptr + 4 * stride);
+ s[5] = vld1q_s16(src_ptr + 5 * stride);
+ s[6] = vld1q_s16(src_ptr + 6 * stride);
+ s[7] = vld1q_s16(src_ptr + 7 * stride);
+
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6]));
+ sum_s32[0] =
+ vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7]));
+
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6]));
+ sum_s32[1] =
+ vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7]));
+
+ src_ptr += 8;
+ cols -= 8;
+ } while (cols);
+
+ sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0]));
+ sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1]));
+ src += 8 * stride;
+ rows -= 8;
+ } while (rows);
+
+ return horizontal_add_uint64x2(sum_u64);
+ }
+}
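+
+// Scalar reference for the kernel above (an illustrative sketch mirroring the
+// generic C implementation; the name is an assumption). size is 4 or a
+// multiple of 8, and the 64-bit accumulator comfortably holds the largest
+// 64x64 total.
+static INLINE uint64_t example_sum_squares_2d_i16_ref(const int16_t *src,
+ int stride, int size) {
+ uint64_t sum = 0;
+ int r, c;
+ for (r = 0; r < size; ++r) {
+ for (c = 0; c < size; ++c) sum += (uint64_t)((int64_t)src[c] * src[c]);
+ src += stride;
+ }
+ return sum;
+}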
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
new file mode 100644
index 0000000000..74f85a6bb6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+// Transpose 64 bit elements as follows:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+//
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s16_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s16_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s32_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s32_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
+ return b0;
+}
+
+static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+ int64x2x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+ b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+#else
+ b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
+ vreinterpret_s64_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
+ vreinterpret_s64_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
+ uint8x16x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u8_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u8_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
+ vreinterpret_u8_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
+ vreinterpret_u8_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
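+
+// Usage sketch (illustrative; the helper below is an assumption, not an
+// upstream symbol): transpose_u8_4x4 expects rows 0-1 of a 4x4 block packed
+// into a0 and rows 2-3 packed into a1. After the transpose, a0 holds output
+// rows 0 and 2 and a1 holds output rows 1 and 3. Rows are assumed 4-byte
+// aligned for the lane loads/stores.
+static INLINE void example_transpose_u8_4x4(const uint8_t *in, int in_stride,
+ uint8_t *out, int out_stride) {
+ uint32x2_t r01 = vdup_n_u32(0);
+ uint32x2_t r23 = vdup_n_u32(0);
+ uint8x8_t a0, a1;
+ r01 = vld1_lane_u32((const uint32_t *)(in + 0 * in_stride), r01, 0);
+ r01 = vld1_lane_u32((const uint32_t *)(in + 1 * in_stride), r01, 1);
+ r23 = vld1_lane_u32((const uint32_t *)(in + 2 * in_stride), r23, 0);
+ r23 = vld1_lane_u32((const uint32_t *)(in + 3 * in_stride), r23, 1);
+ a0 = vreinterpret_u8_u32(r01);
+ a1 = vreinterpret_u8_u32(r23);
+ transpose_u8_4x4(&a0, &a1);
+ vst1_lane_u32((uint32_t *)(out + 0 * out_stride), vreinterpret_u32_u8(a0), 0);
+ vst1_lane_u32((uint32_t *)(out + 1 * out_stride), vreinterpret_u32_u8(a1), 0);
+ vst1_lane_u32((uint32_t *)(out + 2 * out_stride), vreinterpret_u32_u8(a0), 1);
+ vst1_lane_u32((uint32_t *)(out + 3 * out_stride), vreinterpret_u32_u8(a1), 1);
+}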
+
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 01 20 21 02 03 22 23
+ // c1: 10 11 30 31 12 13 32 33
+
+ const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint32x4x2_t b0 =
+ vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 01 20 21 02 03 22 23
+ // c1: 10 11 30 31 12 13 32 33
+
+ const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, const uint8x8_t a4,
+ const uint8x8_t a5, const uint8x8_t a6,
+ const uint8x8_t a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+ // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+ *a2 = d1.val[0];
+ *a3 = d1.val[1];
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
+static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
+ const int16x4_t a2, const int16x4_t a3,
+ const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7,
+ int16x8_t *const o0, int16x8_t *const o1,
+ int16x8_t *const o2, int16x8_t *const o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ *o0 = vreinterpretq_s16_s32(d0.val[0]);
+ *o1 = vreinterpretq_s16_s32(d1.val[0]);
+ *o2 = vreinterpretq_s16_s32(d0.val[1]);
+ *o3 = vreinterpretq_s16_s32(d1.val[1]);
+}
+
+static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
+ const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c2.val[0]);
+ *a2 = vreinterpretq_s32_s64(c1.val[0]);
+ *a3 = vreinterpretq_s32_s64(c3.val[0]);
+ *a4 = vreinterpretq_s32_s64(c0.val[1]);
+ *a5 = vreinterpretq_s32_s64(c2.val[1]);
+ *a6 = vreinterpretq_s32_s64(c1.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
+static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ *a0 = vreinterpretq_u16_u32(c0.val[0]);
+ *a1 = vreinterpretq_u16_u32(c1.val[0]);
+ *a2 = vreinterpretq_u16_u32(c0.val[1]);
+ *a3 = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 04 05 06 07
+ // a2: 10 11 12 13
+ // a3: 14 15 16 17
+ // a4: 20 21 22 23
+ // a5: 24 25 26 27
+ // a6: 30 31 32 33
+ // a7: 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 04 14 06 16
+ // b1.val[1]: 05 15 07 17
+ // b2.val[0]: 20 30 22 32
+ // b2.val[1]: 21 31 23 33
+ // b3.val[0]: 24 34 26 36
+ // b3.val[1]: 25 35 27 37
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
+ const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
+ const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 04 14 24 34
+ // c2.val[1]: 06 16 26 36
+ // c3.val[0]: 05 15 25 35
+ // c3.val[1]: 07 17 27 37
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c1.val[0]);
+ *a2 = vreinterpretq_s32_s64(c0.val[1]);
+ *a3 = vreinterpretq_s32_s64(c1.val[1]);
+ *a4 = vreinterpretq_s32_s64(c2.val[0]);
+ *a5 = vreinterpretq_s32_s64(c3.val[0]);
+ *a6 = vreinterpretq_s32_s64(c2.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
+// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
+// 'q' registers here to save some instructions.
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6, uint8x8_t *a7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
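+
+// Usage sketch (illustrative; the wrapper name is an assumption): an in-place
+// transpose of an 8x8 block stored contiguously. Applying transpose_s16_8x8
+// twice restores the original rows, which is a convenient sanity check.
+static INLINE void example_transpose_s16_8x8_inplace(int16_t *buf /*[64]*/) {
+ int16x8_t a0 = vld1q_s16(buf + 0 * 8);
+ int16x8_t a1 = vld1q_s16(buf + 1 * 8);
+ int16x8_t a2 = vld1q_s16(buf + 2 * 8);
+ int16x8_t a3 = vld1q_s16(buf + 3 * 8);
+ int16x8_t a4 = vld1q_s16(buf + 4 * 8);
+ int16x8_t a5 = vld1q_s16(buf + 5 * 8);
+ int16x8_t a6 = vld1q_s16(buf + 6 * 8);
+ int16x8_t a7 = vld1q_s16(buf + 7 * 8);
+ transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ vst1q_s16(buf + 0 * 8, a0);
+ vst1q_s16(buf + 1 * 8, a1);
+ vst1q_s16(buf + 2 * 8, a2);
+ vst1q_s16(buf + 3 * 8, a3);
+ vst1q_s16(buf + 4 * 8, a4);
+ vst1q_s16(buf + 5 * 8, a5);
+ vst1q_s16(buf + 6 * 8, a6);
+ vst1q_s16(buf + 7 * 8, a7);
+}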
+
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5,
+ uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
+ int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5,
+ int32x4x2_t *a6, int32x4x2_t *a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0: 00 10 02 12 01 11 03 13
+ // b1: 20 30 22 32 21 31 23 33
+ // b2: 40 50 42 52 41 51 43 53
+ // b3: 60 70 62 72 61 71 63 73
+ // b4: 04 14 06 16 05 15 07 17
+ // b5: 24 34 26 36 25 35 27 37
+ // b6: 44 54 46 56 45 55 47 57
+ // b7: 64 74 66 76 65 75 67 77
+
+ const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
+ const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
+ const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
+ const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
+ const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
+ const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
+ const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
+ const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 10 20 30 02 12 22 32
+ // c1: 01 11 21 31 03 13 23 33
+ // c2: 40 50 60 70 42 52 62 72
+ // c3: 41 51 61 71 43 53 63 73
+ // c4: 04 14 24 34 06 16 26 36
+ // c5: 05 15 25 35 07 17 27 37
+ // c6: 44 54 64 74 46 56 66 76
+ // c7: 45 55 65 75 47 57 67 77
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+ const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
+ const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
+ const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
+ const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
+ const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
+ const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);
+
+ // Swap 128 bit elements resulting in:
+ // a0: 00 10 20 30 40 50 60 70
+ // a1: 01 11 21 31 41 51 61 71
+ // a2: 02 12 22 32 42 52 62 72
+ // a3: 03 13 23 33 43 53 63 73
+ // a4: 04 14 24 34 44 54 64 74
+ // a5: 05 15 25 35 45 55 65 75
+ // a6: 06 16 26 36 46 56 66 76
+ // a7: 07 17 27 37 47 57 67 77
+ a0->val[0] = c0.val[0];
+ a0->val[1] = c2.val[0];
+ a1->val[0] = c1.val[0];
+ a1->val[1] = c3.val[0];
+ a2->val[0] = c0.val[1];
+ a2->val[1] = c2.val[1];
+ a3->val[0] = c1.val[1];
+ a3->val[1] = c3.val[1];
+ a4->val[0] = c4.val[0];
+ a4->val[1] = c6.val[0];
+ a5->val[0] = c5.val[0];
+ a5->val[1] = c7.val[0];
+ a6->val[0] = c4.val[1];
+ a6->val[1] = c6.val[1];
+ a7->val[0] = c5.val[1];
+ a7->val[1] = c7.val[1];
+}
+
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
+static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
+ int32x4_t *left2, int32x4_t *right2) {
+ int32x4_t tl[16], tr[16];
+
+ // Transpose the 4 8x8 quadrants separately, but first swap quadrants 2 and 3.
+ tl[0] = left1[8];
+ tl[1] = left1[9];
+ tl[2] = left1[10];
+ tl[3] = left1[11];
+ tl[4] = left1[12];
+ tl[5] = left1[13];
+ tl[6] = left1[14];
+ tl[7] = left1[15];
+ tr[0] = right1[8];
+ tr[1] = right1[9];
+ tr[2] = right1[10];
+ tr[3] = right1[11];
+ tr[4] = right1[12];
+ tr[5] = right1[13];
+ tr[6] = right1[14];
+ tr[7] = right1[15];
+
+ left1[8] = left2[0];
+ left1[9] = left2[1];
+ left1[10] = left2[2];
+ left1[11] = left2[3];
+ left1[12] = left2[4];
+ left1[13] = left2[5];
+ left1[14] = left2[6];
+ left1[15] = left2[7];
+ right1[8] = right2[0];
+ right1[9] = right2[1];
+ right1[10] = right2[2];
+ right1[11] = right2[3];
+ right1[12] = right2[4];
+ right1[13] = right2[5];
+ right1[14] = right2[6];
+ right1[15] = right2[7];
+
+ left2[0] = tl[0];
+ left2[1] = tl[1];
+ left2[2] = tl[2];
+ left2[3] = tl[3];
+ left2[4] = tl[4];
+ left2[5] = tl[5];
+ left2[6] = tl[6];
+ left2[7] = tl[7];
+ right2[0] = tr[0];
+ right2[1] = tr[1];
+ right2[2] = tr[2];
+ right2[3] = tr[3];
+ right2[4] = tr[4];
+ right2[5] = tr[5];
+ right2[6] = tr[6];
+ right2[7] = tr[7];
+
+ transpose_s32_8x8_2(left1, right1, left1, right1);
+ transpose_s32_8x8_2(left2, right2, left2, right2);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
+}
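+
+// Layout sketch (illustrative; the loader name and the row-major layout are
+// assumptions): transpose_s32_16x16 takes the 16x16 block as four arrays of
+// 16 int32x4_t vectors, one entry per row: left1/right1 cover columns
+// 0-3/4-7 and left2/right2 columns 8-11/12-15, consistent with the quadrant
+// swap performed above.
+static INLINE void example_load_and_transpose_s32_16x16(
+ const int32_t *buf, int stride, int32x4_t *left1 /*[16]*/,
+ int32x4_t *right1 /*[16]*/, int32x4_t *left2 /*[16]*/,
+ int32x4_t *right2 /*[16]*/) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ left1[i] = vld1q_s32(buf + i * stride + 0);
+ right1[i] = vld1q_s32(buf + i * stride + 4);
+ left2[i] = vld1q_s32(buf + i * stride + 8);
+ right2[i] = vld1q_s32(buf + i * stride + 12);
+ }
+ transpose_s32_16x16(left1, right1, left2, right2);
+}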
+
+static INLINE void transpose_u8_16x8(
+ const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
+ const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
+ const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
+ uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
+ uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
+ // Swap 8 bit elements. Goes from:
+ // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
+ // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
+ // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
+ // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
+ // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
+ // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
+ // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
+ // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
+ // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
+ // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
+ // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
+ const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
+ const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
+ const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
+ const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
+ // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
+ // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
+ // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
+ // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
+ // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
+ // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
+ // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
+ // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
+ // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
+ // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
+ // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
+ // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ // Output:
+ // o0 : 00 10 20 30 40 50 60 70
+ // o1 : 01 11 21 31 41 51 61 71
+ // o2 : 02 12 22 32 42 52 62 72
+ // o3 : 03 13 23 33 43 53 63 73
+ // o4 : 04 14 24 34 44 54 64 74
+ // o5 : 05 15 25 35 45 55 65 75
+ // o6 : 06 16 26 36 46 56 66 76
+ // o7 : 07 17 27 37 47 57 67 77
+ // o8 : 08 18 28 38 48 58 68 78
+ // o9 : 09 19 29 39 49 59 69 79
+ // o10: 0A 1A 2A 3A 4A 5A 6A 7A
+ // o11: 0B 1B 2B 3B 4B 5B 6B 7B
+ // o12: 0C 1C 2C 3C 4C 5C 6C 7C
+ // o13: 0D 1D 2D 3D 4D 5D 6D 7D
+ // o14: 0E 1E 2E 3E 4E 5E 6E 7E
+ // o15: 0F 1F 2F 3F 4F 5F 6F 7F
+ *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
+ *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
+ *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
+ *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
+ *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
+ *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
+ *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
+ *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
+ *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
+ *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
+ *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
+ *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
+ *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
+ *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
+ *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
+ *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
+}
+
+static INLINE void transpose_u8_8x16(
+ const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
+ const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
+ const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
+ const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
+ const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
+ const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
+ uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
+ uint8x16_t *o7) {
+ // Combine 8 bit elements. Goes from:
+ // i0 : 00 01 02 03 04 05 06 07
+ // i1 : 10 11 12 13 14 15 16 17
+ // i2 : 20 21 22 23 24 25 26 27
+ // i3 : 30 31 32 33 34 35 36 37
+ // i4 : 40 41 42 43 44 45 46 47
+ // i5 : 50 51 52 53 54 55 56 57
+ // i6 : 60 61 62 63 64 65 66 67
+ // i7 : 70 71 72 73 74 75 76 77
+ // i8 : 80 81 82 83 84 85 86 87
+ // i9 : 90 91 92 93 94 95 96 97
+ // i10: A0 A1 A2 A3 A4 A5 A6 A7
+ // i11: B0 B1 B2 B3 B4 B5 B6 B7
+ // i12: C0 C1 C2 C3 C4 C5 C6 C7
+ // i13: D0 D1 D2 D3 D4 D5 D6 D7
+ // i14: E0 E1 E2 E3 E4 E5 E6 E7
+ // i15: F0 F1 F2 F3 F4 F5 F6 F7
+ // to:
+ // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+ // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+ // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
+ // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
+ // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
+ // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
+ // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
+ // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
+ const uint8x16_t a0 = vcombine_u8(i0, i8);
+ const uint8x16_t a1 = vcombine_u8(i1, i9);
+ const uint8x16_t a2 = vcombine_u8(i2, i10);
+ const uint8x16_t a3 = vcombine_u8(i3, i11);
+ const uint8x16_t a4 = vcombine_u8(i4, i12);
+ const uint8x16_t a5 = vcombine_u8(i5, i13);
+ const uint8x16_t a6 = vcombine_u8(i6, i14);
+ const uint8x16_t a7 = vcombine_u8(i7, i15);
+
+ // Swap 8 bit elements resulting in:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
+ // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
+ // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
+ // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
+ // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
+ // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
+ const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
+ const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
+ const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
+ const uint8x16x2_t b3 = vtrnq_u8(a6, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
+ // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
+ // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
+ // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
+ // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
+ // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
+ // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
+ // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ // Output:
+ // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+ *o0 = vreinterpretq_u8_u32(d0.val[0]);
+ *o1 = vreinterpretq_u8_u32(d2.val[0]);
+ *o2 = vreinterpretq_u8_u32(d1.val[0]);
+ *o3 = vreinterpretq_u8_u32(d3.val[0]);
+ *o4 = vreinterpretq_u8_u32(d0.val[1]);
+ *o5 = vreinterpretq_u8_u32(d2.val[1]);
+ *o6 = vreinterpretq_u8_u32(d1.val[1]);
+ *o7 = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+static INLINE void transpose_u8_16x16(
+ const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
+ const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
+ const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
+ const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
+ const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
+ const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
+ uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
+ uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
+ uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
+ uint8x16_t *o15) {
+ // Swap 8 bit elements. Goes from:
+ // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
+ // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
+ // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
+ // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
+ // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
+ // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
+ // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
+ // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
+ // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
+ // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
+ // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
+ // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
+ // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
+ // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
+ // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
+ // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
+ // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
+ // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
+ // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
+ // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
+ // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
+ // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
+ // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
+ // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
+ // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
+ // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
+ // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
+ const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
+ const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
+ const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
+ const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
+ const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
+ const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
+ const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
+ const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
+ // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
+ // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
+ // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
+ // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
+ // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
+ // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
+ // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
+ // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
+ // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
+ // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
+ // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
+ // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
+ // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
+ // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
+ // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+ const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
+ vreinterpretq_u16_u8(b5.val[0]));
+ const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
+ vreinterpretq_u16_u8(b5.val[1]));
+ const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
+ vreinterpretq_u16_u8(b7.val[0]));
+ const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
+ vreinterpretq_u16_u8(b7.val[1]));
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
+ // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
+ // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
+ // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
+ // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
+ // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
+ // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
+ // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
+ // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
+ // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
+ // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
+ // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
+ // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
+ // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+ const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
+ vreinterpretq_u32_u16(c6.val[0]));
+ const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
+ vreinterpretq_u32_u16(c6.val[1]));
+ const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
+ vreinterpretq_u32_u16(c7.val[0]));
+ const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
+ vreinterpretq_u32_u16(c7.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
+ // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
+ // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
+ // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
+ // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
+ // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
+ // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
+ // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+ // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
+ const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
+ const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
+ const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
+ const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
+ const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
+ const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
+ const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
+ const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);
+
+ // Output:
+ // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+ // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
+ // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
+ // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
+ // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
+ // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
+ // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
+ // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
+ // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
+ *o0 = e0.val[0];
+ *o1 = e1.val[0];
+ *o2 = e2.val[0];
+ *o3 = e3.val[0];
+ *o4 = e4.val[0];
+ *o5 = e5.val[0];
+ *o6 = e6.val[0];
+ *o7 = e7.val[0];
+ *o8 = e0.val[1];
+ *o9 = e1.val[1];
+ *o10 = e2.val[1];
+ *o11 = e3.val[1];
+ *o12 = e4.val[1];
+ *o13 = e5.val[1];
+ *o14 = e6.val[1];
+ *o15 = e7.val[1];
+}
+
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+ int16x8_t t[8];
+
+ // Transpose the four 8x8 quadrants separately, but first swap the two
+ // off-diagonal quadrants (2 and 3).
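+ //
+ // Viewed as a 2x2 arrangement of 8x8 sub-blocks, this follows the
+ // block-matrix identity
+ //   [ A B ]^T = [ A^T C^T ]
+ //   [ C D ]     [ B^T D^T ]
+ // so the off-diagonal sub-blocks trade places and each sub-block is then
+ // transposed in place by transpose_s16_8x8() below.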
+ t[0] = in0[8];
+ t[1] = in0[9];
+ t[2] = in0[10];
+ t[3] = in0[11];
+ t[4] = in0[12];
+ t[5] = in0[13];
+ t[6] = in0[14];
+ t[7] = in0[15];
+ in0[8] = in1[0];
+ in0[9] = in1[1];
+ in0[10] = in1[2];
+ in0[11] = in1[3];
+ in0[12] = in1[4];
+ in0[13] = in1[5];
+ in0[14] = in1[6];
+ in0[15] = in1[7];
+ in1[0] = t[0];
+ in1[1] = t[1];
+ in1[2] = t[2];
+ in1[3] = t[3];
+ in1[4] = t[4];
+ in1[5] = t[5];
+ in1[6] = t[6];
+ in1[7] = t[7];
+
+ transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+ &in0[6], &in0[7]);
+ transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+ &in0[14], &in0[15]);
+ transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+ &in1[6], &in1[7]);
+ transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+ &in1[14], &in1[15]);
+}
+
+static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
+ const int a_stride, uint8x8_t *a0,
+ uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ uint8x8_t a4, a5, a6, a7;
+ *a0 = vld1_u8(a);
+ a += a_stride;
+ *a1 = vld1_u8(a);
+ a += a_stride;
+ *a2 = vld1_u8(a);
+ a += a_stride;
+ *a3 = vld1_u8(a);
+ a += a_stride;
+ a4 = vld1_u8(a);
+ a += a_stride;
+ a5 = vld1_u8(a);
+ a += a_stride;
+ a6 = vld1_u8(a);
+ a += a_stride;
+ a7 = vld1_u8(a);
+
+ transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
+ const int a_stride, uint8x8_t *a0,
+ uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4,
+ uint8x8_t *a5, uint8x8_t *a6,
+ uint8x8_t *a7) {
+ *a0 = vld1_u8(a);
+ a += a_stride;
+ *a1 = vld1_u8(a);
+ a += a_stride;
+ *a2 = vld1_u8(a);
+ a += a_stride;
+ *a3 = vld1_u8(a);
+ a += a_stride;
+ *a4 = vld1_u8(a);
+ a += a_stride;
+ *a5 = vld1_u8(a);
+ a += a_stride;
+ *a6 = vld1_u8(a);
+ a += a_stride;
+ *a7 = vld1_u8(a);
+
+ transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
+ uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7) {
+ transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ vst1_u8(a, a0);
+ a += a_stride;
+ vst1_u8(a, a1);
+ a += a_stride;
+ vst1_u8(a, a2);
+ a += a_stride;
+ vst1_u8(a, a3);
+ a += a_stride;
+ vst1_u8(a, a4);
+ a += a_stride;
+ vst1_u8(a, a5);
+ a += a_stride;
+ vst1_u8(a, a6);
+ a += a_stride;
+ vst1_u8(a, a7);
+}
+
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
+ const int a_stride, int16x8_t *a0,
+ int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4,
+ int16x8_t *a5, int16x8_t *a6,
+ int16x8_t *a7) {
+ *a0 = vld1q_s16(a);
+ a += a_stride;
+ *a1 = vld1q_s16(a);
+ a += a_stride;
+ *a2 = vld1q_s16(a);
+ a += a_stride;
+ *a3 = vld1q_s16(a);
+ a += a_stride;
+ *a4 = vld1q_s16(a);
+ a += a_stride;
+ *a5 = vld1q_s16(a);
+ a += a_stride;
+ *a6 = vld1q_s16(a);
+ a += a_stride;
+ *a7 = vld1q_s16(a);
+
+ transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void load_and_transpose_s32_8x8(
+ const int32_t *a, const int a_stride, int32x4x2_t *const a0,
+ int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
+ int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
+ int32x4x2_t *const a7) {
+ a0->val[0] = vld1q_s32(a);
+ a0->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a1->val[0] = vld1q_s32(a);
+ a1->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a2->val[0] = vld1q_s32(a);
+ a2->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a3->val[0] = vld1q_s32(a);
+ a3->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a4->val[0] = vld1q_s32(a);
+ a4->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a5->val[0] = vld1q_s32(a);
+ a5->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a6->val[0] = vld1q_s32(a);
+ a6->val[1] = vld1q_s32(a + 4);
+ a += a_stride;
+ a7->val[0] = vld1q_s32(a);
+ a7->val[1] = vld1q_s32(a + 4);
+
+ transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+#endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..69ff1cf153
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
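+
+// The Armv8.2-A dot-product extension is available here: vdotq_u32
+// accumulates four u8 * u8 products into each u32 lane. The element sums are
+// formed as a dot product with a vector of ones, and the sum of squared
+// differences as the dot product of the absolute difference with itself,
+// since |s - r| * |s - r| == (s - r)^2.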
+
+// Process a block of width 4 four rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += 4 * src_stride;
+ ref_ptr += 4 * ref_stride;
+ i -= 4;
+ } while (i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s =
+ vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+ const uint8x16_t r =
+ vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
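+// Without the dot-product extension, differences are widened to 16 bits and
+// accumulated with vaddq_s16/vmlal_s16. The 16-bit sum accumulators can only
+// absorb a bounded number of rows, hence the per-block-size asserts below.
+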
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' overflows: each 16-bit
+ // lane can absorb 32767 / 255 ~= 128 byte differences, and the 8-wide
+ // accumulator covers two 4-wide rows per iteration; so 256 4-wide rows.
+ assert(h <= 256);
+
+ do {
+ const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
+}
+
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ do {
+ const uint8x8_t s = vld1_u8(src_ptr);
+ const uint8x8_t r = vld1_u8(ref_ptr);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
+ assert(h <= 128);
+
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, int h_limit,
+ unsigned int *sse, int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
+}
+
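+// 'shift' is log2(w * h), so each wrapper computes the variance identity
+// SSE - Sum^2 / N with N = w * h. The Sum * Sum product is widened to 64
+// bits before shifting so that it cannot overflow for the larger blocks.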
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int vpx_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
+
+ sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabdq_u8(s0, r0);
+ diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+ return horizontal_add_uint32x4(sse);
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+ uint16x8_t sse0, sse1;
+
+ s0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
+
+ sse0 = vmull_u8(diff0, diff0);
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(diff1, diff1);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s, r, diff;
+ uint16x8_t sse0, sse1;
+
+ s = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff = vabdq_u8(s, r);
+
+ sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t s[2], r[2];
+ uint16x8_t abs_diff[2];
+ uint32x4_t sse;
+
+ s[0] = load_u8(src_ptr, src_stride);
+ r[0] = load_u8(ref_ptr, ref_stride);
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ s[1] = load_u8(src_ptr, src_stride);
+ r[1] = load_u8(ref_ptr, ref_stride);
+
+ abs_diff[0] = vabdl_u8(s[0], r[0]);
+ abs_diff[1] = vabdl_u8(s[1], r[1]);
+
+ sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0]));
+ sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0]));
+ sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1]));
+ sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1]));
+
+ return horizontal_add_uint32x4(sse);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
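+// Unlike the variance kernels above, the block mean is not subtracted here:
+// the MSE wrappers return the raw sum of squared errors accumulated by the
+// vpx_mse*xh_neon helpers.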
+#define VPX_MSE_WXH_NEON(w, h) \
+ unsigned int vpx_mse##w##x##h##_neon( \
+ const unsigned char *src_ptr, int src_stride, \
+ const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \
+ return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \
+ sse); \
+ }
+
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..d8e4bcc3a7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
@@ -0,0 +1,438 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers*****************************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;load the eight s16 taps; d0 = low
+ ; bytes, d1 = high bytes
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
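+ ; q4/q5 are seeded with 0xc000, i.e. -0x4000 as signed 16-bit. This bias
+ ; cancels in the vhadd.s16 against the +0x4000 in q11, which therefore
+ ; performs a plain signed halving of the filter sum; the vqrshrun #6
+ ; that follows completes the 7-bit down-shift with rounding.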
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlsl.u8 q4, d1, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlal.u8 q4, d2, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlal.u8 q4, d5, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlsl.u8 q4, d6, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlal.u8 q5, d14, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q5, d17, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u8 {d6}, [r1]
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d18, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u8 {d7}, [r6]
+ vrhadd.u8 d20, d20, d6
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d13, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vrhadd.u8 d8, d8, d7
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlsl.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlal.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlal.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ add r7, r1, #8
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vld1.u8 {d0}, [r1]
+ vmlal.u8 q5, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u8 {d2}, [r7]
+ vmlal.u8 q5, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q5, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlsl.u8 q5, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vrhadd.u8 d8, d8, d0
+ vrhadd.u8 d9, d9, d2
+ vmlsl.u8 q11, d1, d24
+ vmlsl.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlal.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ vmlal.u8 q11, d13, d28
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ subeq r14, r14, #2
+ vhadd.s16 q5, q5, q10
+ vmlal.u8 q11, d15, d29
+ addeq r1, r1, r8
+ vmlsl.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vdup.16 q10, r7
+ vld1.u32 {q3}, [r12], r11
+ add r7, r6, #8
+ moveq r5, r10
+ vld1.u8 {d0}, [r6]
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u8 {d2}, [r7]
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q6}, [r12], r11
+ vrhadd.u8 d10, d10, d0
+ vld1.u32 {q7}, [r12], r11
+ vrhadd.u8 d11, d11, d2
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ mov r7, #0xc000
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ add r7, r6, #8
+ vld1.u8 {d20}, [r6]
+ vld1.u8 {d21}, [r7]
+ vrhadd.u8 d10, d10, d20
+ vrhadd.u8 d11, d11, d21
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; iteration in a single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration at the same time
+ vmlsl.u8 q4, d0, d24
+ vmlal.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlal.u8 q4, d5, d29
+ vmlsl.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vld1.u32 {d10[0]}, [r1]
+ vld1.u32 {d10[1]}, [r6]
+ vrhadd.u8 d8, d8, d10
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result, which
+ ; is in the lower half of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result, which
+ ; is in the upper half of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 4
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..7a77747fec
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
@@ -0,0 +1,439 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
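+;
+; This routine mirrors vpx_convolve8_avg_horiz_filter_type1_neon; only the
+; sign pattern of the per-tap multiply-accumulates (vmlal.u8 vs vmlsl.u8)
+; differs, tracking the signs of the eight filter taps.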
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;load the eight s16 taps; d0 = low
+ ; bytes, d1 = high bytes
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u8 {d6}, [r1]
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u8 {d7}, [r6]
+ vrhadd.u8 d20, d20, d6
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vrhadd.u8 d8, d8, d7
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlal.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlsl.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlsl.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ add r7, r1, #8
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vld1.u8 {d0}, [r1]
+ vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u8 {d2}, [r7]
+ vmlal.u8 q5, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vrhadd.u8 d8, d8, d0
+ vrhadd.u8 d9, d9, d2
+ vmlsl.u8 q11, d1, d24
+ vmlal.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlsl.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ vmlal.u8 q11, d13, d28
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ subeq r14, r14, #2
+ vhadd.s16 q5, q5, q10
+ vmlsl.u8 q11, d15, d29
+ addeq r1, r1, r8
+ vmlal.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vdup.16 q10, r7
+ vld1.u32 {q3}, [r12], r11
+ add r7, r6, #8
+ moveq r5, r10
+ vld1.u8 {d0}, [r6]
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u8 {d2}, [r7]
+ vqrshrun.s16 d11, q11, #6
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q6}, [r12], r11
+ vrhadd.u8 d10, d10, d0
+ vld1.u32 {q7}, [r12], r11
+ vrhadd.u8 d11, d11, d2
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ mov r7, #0xc000
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ add r7, r6, #8
+ vld1.u8 {d20}, [r6]
+ vld1.u8 {d21}, [r7]
+ vrhadd.u8 d10, d10, d20
+ vrhadd.u8 d11, d11, d21
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; iteration in a single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlal.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration at the same time
+ vmlsl.u8 q4, d0, d24
+ vmlsl.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlsl.u8 q4, d5, d29
+ vmlal.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vld1.u32 {d10[0]}, [r1]
+ vld1.u32 {d10[1]}, [r6]
+ vrhadd.u8 d8, d8, d10
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result, which
+ ; is in the lower half of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result, which
+ ; is in the upper half of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 4
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..d310a83dad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -0,0 +1,486 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+ vmov.i16 q15, #0x4000
+ mov r11, #0xc000
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub               r12, r2, r2, lsl #2         ;r12 = -3 * src_strd
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+    sub               r7, #4                      ;reserve one 4-row iteration for the epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r3, r3, r2
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ add r14, r1, r6
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlsl.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d6, d25
+ vrhadd.u8 d10, d10, d20
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ addle r0, r0, r8
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vrhadd.u8 d12, d12, d20
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ addle r1, r1, r9
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlsl.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ add r10, r10, r2 ; 11*strd
+ vmlal.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlal.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d16, d28
+ add r10, r10, r2 ;12*strd
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ subs r7, r7, #4
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vrhadd.u8 d12, d12, d20
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vst1.8 {d12}, [r14], r6
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vst1.8 {d14}, [r14], r6
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vmlal.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ add r14, r1, r6
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlal.u8 q7, d16, d27
+ vmlsl.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d12, d12, d20
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d14, d14, d20
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlsl.u8 q0, d5, d23 ;mul_res1 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlsl.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlal.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlal.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlsl.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vld1.u32 {d20[0]}, [r1]
+ vld1.u32 {d20[1]}, [r3]
+ vrhadd.u8 d0, d0, d20
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ mov r4, r3
+ vld1.u32 {d20[0]}, [r4], r6
+ vld1.u32 {d20[1]}, [r4]
+ vrhadd.u8 d8, d8, d20
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
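
A hedged scalar reference for the vertical avg path above (the helper name and the single-step rounding are illustrative; the assembly uses the seeded-accumulator scheme sketched earlier, which is equivalent up to the split rounding):

    #include <stdint.h>

    static void avg_convolve8_vert_model(const uint8_t *src, int src_stride,
                                         uint8_t *dst, int dst_stride,
                                         const int *taps, int w, int h) {
      int x, y, k;
      src -= 3 * src_stride;          /* mirrors r12 = r2 - (r2 << 2) above   */
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          int sum = 0, px;
          for (k = 0; k < 8; ++k)
            sum += src[(y + k) * src_stride + x] * taps[k];
          px = (sum + 64) >> 7;       /* round to the 8-bit output range      */
          px = px < 0 ? 0 : (px > 255 ? 255 : px);
          dst[y * dst_stride + x] =
              (uint8_t)((dst[y * dst_stride + x] + px + 1) >> 1);  /* average */
        }
      }
    }
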
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..c5695fbda8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
@@ -0,0 +1,487 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+ vmov.i16 q15, #0x4000
+ mov r11, #0xc000
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub               r12, r2, r2, lsl #2         ;r12 = -3 * src_strd
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+    sub               r7, #4                      ;reserve one 4-row iteration for the epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r3, r3, r2
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ add r14, r1, r6
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlal.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d6, d25
+ vrhadd.u8 d10, d10, d20
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ addle r0, r0, r8
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vrhadd.u8 d12, d12, d20
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ addle r1, r1, r9
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlal.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ add r10, r10, r2 ; 11*strd
+ vmlsl.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlsl.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlal.u8 q6, d16, d28
+ add r10, r10, r2 ;12*strd
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ subs r7, r7, #4
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vld1.u8 {d20}, [r14]
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vrhadd.u8 d12, d12, d20
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vst1.8 {d12}, [r14], r6
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vst1.8 {d14}, [r14], r6
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vmlsl.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ add r14, r1, r6
+ vmlal.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlsl.u8 q7, d16, d27
+ vmlal.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d12, d12, d20
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d14, d14, d20
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlal.u8 q0, d5, d23 ;mul_res1 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlal.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlsl.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlsl.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlal.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vld1.u32 {d20[0]}, [r1]
+ vld1.u32 {d20[1]}, [r3]
+ vrhadd.u8 d0, d0, d20
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ mov r4, r3
+ vld1.u32 {d20[0]}, [r4], r6
+ vld1.u32 {d20[1]}, [r4]
+ vrhadd.u8 d8, d8, d20
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..fa1b732466
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
@@ -0,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlsl.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlal.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlal.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vmlsl.u8 q11, d1, d24
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ vmlsl.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlal.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ addeq r1, r1, r8
+ subeq r14, r14, #2
+ vmlal.u8 q11, d13, d28
+ vhadd.s16 q5, q5, q10
+ vmlal.u8 q11, d15, d29
+ vmlsl.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q9}, [r12], r11
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ moveq r5, r10
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+                                           ; iteration into a single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii
+                                           ; iteration at the same time
+ vmlsl.u8 q4, d0, d24
+ vmlal.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlal.u8 q4, d5, d29
+ vmlsl.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+    subs              r14, r14, #2                ;decrement the ht by 2
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..90b2c8fef7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
@@ -0,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlal.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlsl.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlsl.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vmlsl.u8 q11, d1, d24
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ vmlal.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlsl.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ addeq r1, r1, r8
+ subeq r14, r14, #2
+ vmlal.u8 q11, d13, d28
+ vhadd.s16 q5, q5, q10
+ vmlsl.u8 q11, d15, d29
+ vmlal.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q9}, [r12], r11
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ moveq r5, r10
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+                                           ; iteration into a single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlal.u8 q4, d1, d25 ;arithmetic operations for ii
+                                           ; iteration at the same time
+ vmlsl.u8 q4, d0, d24
+ vmlsl.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlsl.u8 q4, d5, d29
+ vmlal.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+    subs              r14, r14, #2                ;decrement the ht by 2
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
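
The only difference between the _filter_type1 and _filter_type2 variants above is which taps are accumulated with vmlal versus vmlsl: both take the absolute value of the int8 coefficients up front (vabs.s8) and then add or subtract each tap's product according to that coefficient's sign. A minimal sketch of the idea, with a hypothetical sign[] table standing in for the two hard-coded patterns:

    /* The asm hard-codes two different sign patterns rather than reading a
       table; sign[] and coeffabs[] here are purely illustrative. */
    static int convolve8_signed_taps(const unsigned char *src,
                                     const unsigned char *coeffabs,
                                     const int *sign) {
      int sum = 0, k;
      for (k = 0; k < 8; ++k) sum += sign[k] * coeffabs[k] * src[k];
      return sum;
    }
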
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 0000000000..b312cc747c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,2110 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_ports/mem.h"
+
+// Note:
+// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
+// 2. After refactoring the shared code in kernel loops with inline functions,
+// the decoder speed dropped a lot when built with gcc. Therefore those parts
+// have not been refactored for now.
+// 3. For horizontal convolve, there is an alternative optimization that
+// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
+// samples in each are read from memory: src, (src+1), (src+2), (src+3),
+// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
+// instructions. This optimization is much faster in speed unit test, but slowed
+// down the whole decoder by 5%.
+
+#if VPX_ARCH_AARCH64 && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
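+// A rough sketch of how this table is used: with a 16-byte source vector s
+// holding samples x0..x15 and the table loaded as a uint8x16x2_t (as in the
+// w == 4 paths below), vqtbl1q_u8(s, permute_tbl.val[0]) produces the four
+// overlapping 4-sample windows
+//   { x0,x1,x2,x3, x1,x2,x3,x4, x2,x3,x4,x5, x3,x4,x5,x6 },
+// i.e. the first four filter taps for four adjacent output pixels, laid out
+// for a single SDOT/USDOT instruction.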
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
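+// This table is applied with vqtbl2q_u8/vqtbl2q_s8 to the register pair
+// { previous transposed block, newly loaded transposed block }: indices 0-15
+// select bytes from the old block, while indices 16 and up pull in the new
+// columns. For example, the row 1, 2, 3, 16 rebuilds s4567 from s3456 plus
+// one column of s78910, avoiding a full re-transpose each iteration.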
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
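+  /* Back up 3 samples: the 8-tap filter for output pixel x reads
+   * src[x - 3] .. src[x + 4].
+   */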
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
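+      /* The zero initialization is presumably just to avoid spurious
+       * maybe-uninitialized warnings; both values are overwritten by the
+       * loads from dst below.
+       */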
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
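+  /* Start 3 rows above: the 8-tap vertical filter for output row y reads
+   * source rows y - 3 .. y + 4.
+   */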
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23, dd01, dd23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+#else // !defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
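+  /* A note on the constants above: the dot-product path operates on
+   * range-shifted signed samples, which biases every output by
+   * -128 * sum(filter taps). 'correction' is the opposite bias, pre-loaded
+   * into the accumulator so that sum((x - 128) * f) + 128 * sum(f) is
+   * identical to sum(x * f).
+   */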
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+    /* Shift sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+      /* Shift sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23, dd01, dd23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+    /* Shift sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+      /* Shift sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+#endif // defined(__ARM_FEATURE_MATMUL_INT8)
+
+#else // !(VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ uint8x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (h == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t tt0, tt1, tt2, tt3;
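+    /* A note on the strategy here: the block is transposed so that the
+     * horizontal filter runs along vector lanes (plain multiply-accumulates
+     * across s0..s7), and the 4x4 results are transposed back before the
+     * 32-bit lane stores below.
+     */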
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ transpose_u8_4x4(&d01, &d23);
+
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
+ vreinterpret_u32_u8(d01), 1);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
+ vreinterpret_u32_u8(d23), 1);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ int width;
+ const uint8_t *s;
+ uint8x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ uint8_t *d;
+ int16x8_t s11, s12, s13, s14;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ uint8x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (h == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint32x4_t d0123 = vdupq_n_u32(0);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ transpose_u8_4x4(&d01, &d23);
+
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+ d0123 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+
+ vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
+ vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
+ vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ int width;
+ const uint8_t *s;
+ uint8x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ uint32x4_t d0415 = vdupq_n_u32(0);
+ uint32x4_t d2637 = vdupq_n_u32(0);
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
+ d0415 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
+ d2637 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
+
+ vst1q_lane_u32((uint32_t *)dst, d0415, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 3);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 3);
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ uint8_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint8x16_t d01, d23, d45, d67;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
+ vld1_u8(d + 1 * dst_stride));
+ d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
+ vld1_u8(d + 3 * dst_stride));
+ d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
+ vld1_u8(d + 5 * dst_stride));
+ d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
+ vld1_u8(d + 7 * dst_stride));
+ d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
+ d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
+ d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
+ d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
+
+ store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
+ vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
+ vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0, t1, t2, t3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ vst1_u8(d, t2);
+ d += dst_stride;
+ vst1_u8(d, t3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ uint32x4_t d0123 = vdupq_n_u32(0);
+
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+ d0123 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+
+ vst1q_lane_u32((uint32_t *)dst, d0123, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 3);
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0, t1, t2, t3;
+ uint8x16_t d01, d23, dd01, dd23;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vcombine_u8(t0, t1);
+ d23 = vcombine_u8(t2, t3);
+ dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
+ vld1_u8(d + 1 * dst_stride));
+ dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
+ vld1_u8(d + 3 * dst_stride));
+ dd01 = vrhaddq_u8(dd01, d01);
+ dd23 = vrhaddq_u8(dd23, d23);
+
+ vst1_u8(d, vget_low_u8(dd01));
+ d += dst_stride;
+ vst1_u8(d, vget_high_u8(dd01));
+ d += dst_stride;
+ vst1_u8(d, vget_low_u8(dd23));
+ d += dst_stride;
+ vst1_u8(d, vget_high_u8(dd23));
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+#endif // #if VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
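The avg variants above blend the new filter output into the pixels already in dst with vrhaddq_u8, a rounding halving add. A minimal scalar sketch of that blend, assuming 8-bit pixels; the helper name is illustrative and not part of libvpx:

#include <stdint.h>

/* Scalar equivalent of the vrhaddq_u8() blend used by the avg variants:
 * dst[x] = (dst[x] + filtered[x] + 1) >> 1 for each pixel in a row. */
static void avg_blend_row(uint8_t *dst, const uint8_t *filtered, int w) {
  int x;
  for (x = 0; x < w; ++x) {
    dst[x] = (uint8_t)((dst[x] + filtered[x] + 1) >> 1);
  }
}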
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
new file mode 100644
index 0000000000..07cf8242d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
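The 'correction' these sdot helpers consume exists because every input sample is biased by -128 (the range_limit subtraction) so it fits the signed 8-bit dot-product operands; that bias contributes exactly -128 * (sum of the filter taps) to each output, so it can be undone by seeding the accumulator with the opposite constant. A minimal sketch of how such a correction vector could be built, assuming <arm_neon.h>; the helper name is illustrative and not the library's own:

#include <arm_neon.h>

/* sum(filter[i] * (sample[i] - 128)) == sum(filter[i] * sample[i])
 *                                       - 128 * sum(filter[i]),
 * so seeding the accumulator with 128 * sum(filter[i]) restores the
 * unbiased convolution sum. */
static int32x4_t make_sdot_correction(const int16x8_t filters) {
  int16_t f[8];
  int32_t sum = 0;
  int i;
  vst1q_s16(f, filters);
  for (i = 0; i < 8; ++i) sum += f[i];
  return vdupq_n_s32(128 * sum);
}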
+
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product. No range-clamp correction is needed here. */
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
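Unlike the sdot path, these usdot helpers need no correction term: vusdotq_lane_s32 multiplies unsigned samples by signed filter taps directly, so the accumulators start from zero. The permute tables they expect follow the lane orders spelled out in the comments above. A sketch of one way to build the two-row table, with an illustrative (non-libvpx) name:

#include <arm_neon.h>

/* Lane order matches the comments above: four overlapping 4-sample windows
 * per 16-byte row, starting at sample offsets 0 and 4 respectively. */
static const uint8_t kPermuteTbl[32] = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
};

static uint8x16x2_t load_permute_tbl(void) {
  uint8x16x2_t tbl;
  tbl.val[0] = vld1q_u8(kPermuteTbl);
  tbl.val[1] = vld1q_u8(kPermuteTbl + 16);
  return tbl;
}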
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int16x4_t sum;
+
+ sum = vmul_lane_s16(s0, filters_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filters_hi, 3);
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filters) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filters_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
+ return vqrshrun_n_s16(sum, 7);
+}
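convolve8_4 and convolve8_8 accumulate most taps with plain multiply-accumulates and add the two centre taps (applied to s3 and s4, the largest coefficients) with saturating vqadd/vqaddq, presumably to keep the 16-bit intermediate sums from wrapping. The final vqrshrun_n_s16(sum, 7) rounds, shifts by FILTER_BITS (7), and clamps to [0, 255]. A scalar sketch of the per-pixel result, with an illustrative helper name:

#include <stdint.h>

/* One output pixel of the 8-tap filter: FIR sum, round to nearest,
 * shift by 7 (FILTER_BITS), clamp to the 8-bit range. */
static uint8_t convolve8_scalar(const uint8_t *s, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += (int)filter[k] * s[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}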
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+ const int16x8_t filters) {
+ int16x8_t ss[8];
+
+ ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+ ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+ ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+ ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+ return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
+ filters);
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
new file mode 100644
index 0000000000..c4177c5385
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h"
+
+/* Type1 and Type2 functions are called depending on the position of the
+ * negative and positive coefficients in the filter. In type1, the filter kernel
+ * used is sub_pel_filters_8lp, in which only the first two and the last two
+ * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 &
+ * 7.
+ */
+
+#define DEFINE_FILTER(dir) \
+ void vpx_convolve8_##dir##_neon( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ if (filter == vp9_filter_kernels[1]) { \
+ vpx_convolve8_##dir##_filter_type1_neon( \
+ src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } else { \
+ vpx_convolve8_##dir##_filter_type2_neon( \
+ src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } \
+ }
+
+DEFINE_FILTER(horiz)
+DEFINE_FILTER(avg_horiz)
+DEFINE_FILTER(vert)
+DEFINE_FILTER(avg_vert)
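For reference, this is what one expansion of the dispatch macro looks like (reconstructed directly from DEFINE_FILTER above; it assumes the InterpKernel and vp9_filter_kernels declarations pulled in by the includes at the top of this file):

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  if (filter == vp9_filter_kernels[1]) {
    vpx_convolve8_horiz_filter_type1_neon(src, src_stride, dst, dst_stride,
                                          filter, x0_q4, x_step_q4, y0_q4,
                                          y_step_q4, w, h);
  } else {
    vpx_convolve8_horiz_filter_type2_neon(src, src_stride, dst, dst_stride,
                                          filter, x0_q4, x_step_q4, y0_q4,
                                          y_step_q4, w, h);
  }
}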
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
new file mode 100644
index 0000000000..f1c7d62ed0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+
+#define DECLARE_FILTER(dir, type) \
+ void vpx_convolve8_##dir##_filter_##type##_neon( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+DECLARE_FILTER(horiz, type1)
+DECLARE_FILTER(avg_horiz, type1)
+DECLARE_FILTER(horiz, type2)
+DECLARE_FILTER(avg_horiz, type2)
+DECLARE_FILTER(vert, type1)
+DECLARE_FILTER(avg_vert, type1)
+DECLARE_FILTER(vert, type2)
+DECLARE_FILTER(avg_vert, type2)
+
+#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..2666d4253e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
@@ -0,0 +1,457 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+ vmov.i16 q15, #0x4000
+ mov r11, #0xc000
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub             r12, r2, r2, lsl #2     ;src_strd & pi1_coeff
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r3, r3, r2
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlsl.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ addle r1, r1, r9
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlsl.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ add r10, r10, r2 ; 11*strd
+ vmlal.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlal.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ vmlsl.u8 q6, d16, d28
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d17, d29
+ add r10, r10, r2 ;12*strd
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ subs r7, r7, #4
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vst1.8 {d14}, [r14], r6
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vmlal.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlal.u8 q7, d16, d27
+ vmlsl.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from
+ ; sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlsl.u8 q0, d5, d23 ;mul_res1 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlsl.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlal.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlal.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlsl.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..cb5d6d3fe5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
@@ -0,0 +1,455 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+ vmov.i16 q15, #0x4000
+ mov r11, #0xc000
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub             r12, r2, r2, lsl #2     ;src_strd & pi1_coeff
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r3, r3, r2
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlal.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+ ; coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ addle r1, r1, r9
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlal.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ add r10, r10, r2 ; 11*strd
+ vmlsl.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlsl.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ vmlal.u8 q6, d16, d28
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d17, d29
+ add r10, r10, r2 ;12*strd
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ subs r7, r7, #4
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+    vmlal.u8        q4, d1, d23             ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vmlsl.u8        q4, d0, d22             ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+ ; coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vst1.8 {d14}, [r14], r6
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vmlsl.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlsl.u8 q7, d16, d27
+ vmlal.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlal.u8 q0, d5, d23 ;mul_res1 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlal.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlsl.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlsl.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlal.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..8e3ee599f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (w < 8) { // avg4
+ uint8x8_t s0, s1;
+ uint8x8_t dd0 = vdup_n_u8(0);
+ uint32x2x2_t s01;
+ do {
+ s0 = vld1_u8(src);
+ src += src_stride;
+ s1 = vld1_u8(src);
+ src += src_stride;
+ s01 = vzip_u32(vreinterpret_u32_u8(s0), vreinterpret_u32_u8(s1));
+ dd0 = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)dst, vreinterpret_u32_u8(dd0), 0));
+ dd0 = vreinterpret_u8_u32(vld1_lane_u32(
+ (const uint32_t *)(dst + dst_stride), vreinterpret_u32_u8(dd0), 1));
+ dd0 = vrhadd_u8(vreinterpret_u8_u32(s01.val[0]), dd0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // avg8
+ uint8x8_t s0, s1, d0, d1;
+ uint8x16_t s01, d01;
+ do {
+ s0 = vld1_u8(src);
+ src += src_stride;
+ s1 = vld1_u8(src);
+ src += src_stride;
+ d0 = vld1_u8(dst);
+ d1 = vld1_u8(dst + dst_stride);
+
+ s01 = vcombine_u8(s0, s1);
+ d01 = vcombine_u8(d0, d1);
+ d01 = vrhaddq_u8(s01, d01);
+
+ vst1_u8(dst, vget_low_u8(d01));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(d01));
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // avg16
+ uint8x16_t s0, s1, d0, d1;
+ do {
+ s0 = vld1q_u8(src);
+ src += src_stride;
+ s1 = vld1q_u8(src);
+ src += src_stride;
+ d0 = vld1q_u8(dst);
+ d1 = vld1q_u8(dst + dst_stride);
+
+ d0 = vrhaddq_u8(s0, d0);
+ d1 = vrhaddq_u8(s1, d1);
+
+ vst1q_u8(dst, d0);
+ dst += dst_stride;
+ vst1q_u8(dst, d1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // avg32
+ uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u8(src);
+ s1 = vld1q_u8(src + 16);
+ src += src_stride;
+ s2 = vld1q_u8(src);
+ s3 = vld1q_u8(src + 16);
+ src += src_stride;
+ d0 = vld1q_u8(dst);
+ d1 = vld1q_u8(dst + 16);
+ d2 = vld1q_u8(dst + dst_stride);
+ d3 = vld1q_u8(dst + dst_stride + 16);
+
+ d0 = vrhaddq_u8(s0, d0);
+ d1 = vrhaddq_u8(s1, d1);
+ d2 = vrhaddq_u8(s2, d2);
+ d3 = vrhaddq_u8(s3, d3);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ dst += dst_stride;
+ vst1q_u8(dst, d2);
+ vst1q_u8(dst + 16, d3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else { // avg64
+ uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u8(src);
+ s1 = vld1q_u8(src + 16);
+ s2 = vld1q_u8(src + 32);
+ s3 = vld1q_u8(src + 48);
+ src += src_stride;
+ d0 = vld1q_u8(dst);
+ d1 = vld1q_u8(dst + 16);
+ d2 = vld1q_u8(dst + 32);
+ d3 = vld1q_u8(dst + 48);
+
+ d0 = vrhaddq_u8(s0, d0);
+ d1 = vrhaddq_u8(s1, d1);
+ d2 = vrhaddq_u8(s2, d2);
+ d3 = vrhaddq_u8(s3, d3);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += dst_stride;
+ } while (--h);
+ }
+}
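
Every branch above reduces to vrhadd/vrhaddq, the rounding halving add. A scalar model of the per-pixel operation (illustrative, not part of the source):

    /* Scalar equivalent of vrhadd_u8 on one pixel pair. */
    static unsigned char round_avg(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);
    }

For example, round_avg(1, 2) is 2, where a truncating average would give 1.
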
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
new file mode 100644
index 0000000000..efd6574f1f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_avg_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_avg_neon| PROC
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #36]
+ mov r6, r2
+
+ cmp r4, #32
+ bgt avg64
+ beq avg32
+ cmp r4, #8
+ bgt avg16
+ beq avg8
+ b avg4
+
+avg64
+ sub lr, r1, #32
+ sub r4, r3, #32
+avg64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ pld [r2, r3]
+ vld1.8 {q8-q9}, [r6@128]!
+ vld1.8 {q10-q11}, [r6@128], r4
+ vrhadd.u8 q0, q0, q8
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r4
+ subs r5, r5, #1
+ bgt avg64_h
+ pop {r4-r6, pc}
+
+avg32
+ vld1.8 {q0-q1}, [r0], r1
+ vld1.8 {q2-q3}, [r0], r1
+ vld1.8 {q8-q9}, [r6@128], r3
+ vld1.8 {q10-q11}, [r6@128], r3
+ pld [r0]
+ vrhadd.u8 q0, q0, q8
+ pld [r0, r1]
+ vrhadd.u8 q1, q1, q9
+ pld [r6]
+ vrhadd.u8 q2, q2, q10
+ pld [r6, r3]
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg32
+ pop {r4-r6, pc}
+
+avg16
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q2}, [r6@128], r3
+ vld1.8 {q3}, [r6@128], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q2
+ pld [r6]
+ pld [r6, r3]
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg16
+ pop {r4-r6, pc}
+
+avg8
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d2}, [r6@64], r3
+ vld1.8 {d3}, [r6@64], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q1
+ pld [r6]
+ pld [r6, r3]
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d1}, [r2@64], r3
+ subs r5, r5, #2
+ bgt avg8
+ pop {r4-r6, pc}
+
+avg4
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[0]}, [r6@32], r3
+ vld1.32 {d2[1]}, [r6@32], r3
+ vrhadd.u8 d0, d0, d2
+ vst1.32 {d0[0]}, [r2@32], r3
+ vst1.32 {d0[1]}, [r2@32], r3
+ subs r5, r5, #2
+ bgt avg4
+ pop {r4-r6, pc}
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..bea7c98437
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (w < 8) { // copy4
+ do {
+ memcpy(dst, src, 4);
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 4);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint8x8_t s0, s1;
+ do {
+ s0 = vld1_u8(src);
+ src += src_stride;
+ s1 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, s0);
+ dst += dst_stride;
+ vst1_u8(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint8x16_t s0, s1;
+ do {
+ s0 = vld1q_u8(src);
+ src += src_stride;
+ s1 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, s0);
+ dst += dst_stride;
+ vst1q_u8(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint8x16_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u8(src);
+ s1 = vld1q_u8(src + 16);
+ src += src_stride;
+ s2 = vld1q_u8(src);
+ s3 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, s0);
+ vst1q_u8(dst + 16, s1);
+ dst += dst_stride;
+ vst1q_u8(dst, s2);
+ vst1q_u8(dst + 16, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else { // copy64
+ uint8x16_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u8(src);
+ s1 = vld1q_u8(src + 16);
+ s2 = vld1q_u8(src + 32);
+ s3 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, s0);
+ vst1q_u8(dst + 16, s1);
+ vst1q_u8(dst + 32, s2);
+ vst1q_u8(dst + 48, s3);
+ dst += dst_stride;
+ } while (--h);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
new file mode 100644
index 0000000000..7a66e3ce2f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -0,0 +1,84 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_copy_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_copy_neon| PROC
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #32]
+
+ cmp r4, #32
+ bgt copy64
+ beq copy32
+ cmp r4, #8
+ bgt copy16
+ beq copy8
+ b copy4
+
+copy64
+ sub lr, r1, #32
+ sub r3, r3, #32
+copy64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #1
+ bgt copy64_h
+ pop {r4-r5, pc}
+
+copy32
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q2-q3}, [r0], r1
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy32
+ pop {r4-r5, pc}
+
+copy16
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q1}, [r0], r1
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy16
+ pop {r4-r5, pc}
+
+copy8
+ pld [r0, r1, lsl #1]
+ vld1.8 {d0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {d2}, [r0], r1
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d2}, [r2@64], r3
+ subs r5, r5, #2
+ bgt copy8
+ pop {r4-r5, pc}
+
+copy4
+ ldr r12, [r0], r1
+ str r12, [r2], r3
+ subs r5, r5, #1
+ bgt copy4
+ pop {r4-r5, pc}
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 0000000000..830f3176d7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+  /* Given our constraints (w <= 64, h <= 64, taps == 8), we can reduce the
+   * maximum buffer size to 64 * (64 + 7 + 1): 7 extra rows for the filter
+   * taps, plus 1 to make the row count divisible by 4.
+   */
+ uint8_t temp[64 * 72];
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ // (+ 1 to make it divisible by 4).
+ const int intermediate_height = h + 8;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes into the
+   * temp buffer, which has lots of extra room and is subsequently discarded,
+   * this is safe if somewhat less than ideal. */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ uint8_t temp[64 * 72];
+ const int intermediate_height = h + 8;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
+ vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
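
The temp sizing in both functions can be sanity-checked from the comments above: at most 64 columns and h + 8 intermediate rows. A throwaway check (standalone, not part of the source):

    #include <assert.h>

    static void check_convolve_temp_bounds(void) {
      const int max_w = 64, max_h = 64;
      /* 3 rows above + 4 below + 1 pad row = 8 extra rows. */
      const int max_intermediate_height = max_h + 8;
      assert(max_w * max_intermediate_height <= 64 * 72); /* temp[64 * 72] */
    }
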
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
new file mode 100644
index 0000000000..b8e3c5e540
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void scaledconvolve_horiz_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ y = h;
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ int16x8_t ss[4];
+ int16x4_t t[8], tt;
+
+ load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+ transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+ ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+ ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+ ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+ ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+ t[0] = vget_low_s16(ss[0]);
+ t[1] = vget_low_s16(ss[1]);
+ t[2] = vget_low_s16(ss[2]);
+ t[3] = vget_low_s16(ss[3]);
+ t[4] = vget_high_s16(ss[0]);
+ t[5] = vget_high_s16(ss[1]);
+ t[6] = vget_high_s16(ss[2]);
+ t[7] = vget_high_s16(ss[3]);
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+ filters);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ {
+ const uint8x8x4_t d4 = vld4_u8(temp);
+ vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
+ vreinterpret_u32_u8(d4.val[0]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
+ vreinterpret_u32_u8(d4.val[1]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
+ vreinterpret_u32_u8(d4.val[2]), 0);
+ vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
+ vreinterpret_u32_u8(d4.val[3]), 0);
+ }
+ x += 4;
+ } while (x < w);
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ y -= 4;
+ } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4, const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = (h + 7) & ~7;
+
+ do {
+ int x_q4 = x0_q4;
+ x = 0;
+ do {
+ uint8x8_t d[8];
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+ if (x_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8];
+ load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+ &s[5], &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ d[0] = scale_filter_8(s, filters);
+ vst1_u8(&temp[8 * z], d[0]);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ vst1_u8(&dst[x + 0 * dst_stride], d[0]);
+ vst1_u8(&dst[x + 1 * dst_stride], d[1]);
+ vst1_u8(&dst[x + 2 * dst_stride], d[2]);
+ vst1_u8(&dst[x + 3 * dst_stride], d[3]);
+ vst1_u8(&dst[x + 4 * dst_stride], d[4]);
+ vst1_u8(&dst[x + 5 * dst_stride], d[5]);
+ vst1_u8(&dst[x + 6 * dst_stride], d[6]);
+ vst1_u8(&dst[x + 7 * dst_stride], d[7]);
+ x += 8;
+ } while (x < w);
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ int16x4_t t[8], tt;
+
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+ t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+ t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+ t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+ t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+ t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+ t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+ t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
+ d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x8_t s[8], d;
+ load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ d = scale_filter_8(s, filters);
+ vst1_u8(dst, d);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int x, y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ y = h;
+ do {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ if (y_q4 & SUBPEL_MASK) {
+ x = 0;
+ do {
+ const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+ uint8x16_t ss[8];
+ uint8x8_t s[8], d[2];
+ load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+ &ss[5], &ss[6], &ss[7]);
+ s[0] = vget_low_u8(ss[0]);
+ s[1] = vget_low_u8(ss[1]);
+ s[2] = vget_low_u8(ss[2]);
+ s[3] = vget_low_u8(ss[3]);
+ s[4] = vget_low_u8(ss[4]);
+ s[5] = vget_low_u8(ss[5]);
+ s[6] = vget_low_u8(ss[6]);
+ s[7] = vget_low_u8(ss[7]);
+ d[0] = scale_filter_8(s, filters);
+
+ s[0] = vget_high_u8(ss[0]);
+ s[1] = vget_high_u8(ss[1]);
+ s[2] = vget_high_u8(ss[2]);
+ s[3] = vget_high_u8(ss[3]);
+ s[4] = vget_high_u8(ss[4]);
+ s[5] = vget_high_u8(ss[5]);
+ s[6] = vget_high_u8(ss[6]);
+ s[7] = vget_high_u8(ss[7]);
+ d[1] = scale_filter_8(s, filters);
+ vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+ src_y += 16;
+ x += 16;
+ } while (x < w);
+ } else {
+ memcpy(dst, &src_y[3 * src_stride], w);
+ }
+ dst += dst_stride;
+ y_q4 += y_step_q4;
+ } while (--y);
+}
+
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
+  // is still big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
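
The row-count derivation in the comment can be spot-checked against the code's formula (SUBPEL_BITS is 4 and SUBPEL_TAPS is 8 here); a throwaway snippet, not part of the source:

    #include <assert.h>

    static void check_scaled_temp_rows(void) {
      /* Worst case from the comment: h = 64, y_step_q4 = 32, y0_q4 <= 15. */
      int rows = (((64 - 1) * 32 + 15) >> 4) + 8;
      assert(rows <= 135 + 8); /* temp holds (135 + 8) * 64 bytes */
      /* Frame-scaling case from the assert: h <= 32 when y_step_q4 <= 64. */
      rows = (((32 - 1) * 64 + 15) >> 4) + 8;
      assert(rows <= 135 + 8);
    }
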
diff --git a/media/libvpx/libvpx/vpx_dsp/avg.c b/media/libvpx/libvpx/vpx_dsp/avg.c
new file mode 100644
index 0000000000..391e9eb144
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/avg.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return (sum + 32) >> 6;
+}
+
+unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return (sum + 8) >> 4;
+}
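
Both averages use the usual rounded-division idiom, adding half the divisor before shifting:

    /* Rounded division by 64, as in vpx_avg_8x8_c; e.g. for sum = 96 this
     * yields (96 + 32) >> 6 == 2, where plain truncation gives 96 >> 6 == 1. */
    static unsigned int round_div64(int sum) { return (sum + 32) >> 6; }
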
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// src_diff: 13 bit, dynamic range [-4095, 4095]
+// coeff: 16 bit
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit
+static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int32_t *coeff) {
+ int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int32_t c0 = b0 + b2;
+ int32_t c1 = b1 + b3;
+ int32_t c2 = b0 - b2;
+ int32_t c3 = b1 - b3;
+ int32_t c4 = b4 + b6;
+ int32_t c5 = b5 + b7;
+ int32_t c6 = b4 - b6;
+ int32_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// The order of the output coefficients of the Hadamard transform is not
+// important. For optimization purposes the final transpose may be skipped.
+void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int32_t buffer2[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ // src_diff: 13 bit
+ // buffer: 16 bit, dynamic range [-32760, 32760]
+ hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ // buffer: 16 bit
+ // buffer2: 19 bit, dynamic range [-262080, 262080]
+ hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
+ ++tmp_buf;
+ }
+
+ for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
+// In-place 16x16 2D Hadamard transform
+void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 13 bit, dynamic range [-4095, 4095]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ // coeff: 19 bit, dynamic range [-262080, 262080]
+ for (idx = 0; idx < 64; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
+
+ tran_low_t b0 = (a0 + a1) >> 1;
+ tran_low_t b1 = (a0 - a1) >> 1;
+ tran_low_t b2 = (a2 + a3) >> 1;
+ tran_low_t b3 = (a2 - a3) >> 1;
+
+ // new coeff dynamic range: 20 bit
+ coeff[0] = b0 + b2;
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 13 bit, dynamic range [-4095, 4095]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+ }
+
+ // coeff: 20 bit
+ for (idx = 0; idx < 256; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[256];
+ tran_low_t a2 = coeff[512];
+ tran_low_t a3 = coeff[768];
+
+ tran_low_t b0 = (a0 + a1) >> 2;
+ tran_low_t b1 = (a0 - a1) >> 2;
+ tran_low_t b2 = (a2 + a3) >> 2;
+ tran_low_t b3 = (a2 - a3) >> 2;
+
+ // new coeff dynamic range: 20 bit
+ coeff[0] = b0 + b2;
+ coeff[256] = b1 + b3;
+ coeff[512] = b0 - b2;
+ coeff[768] = b1 - b3;
+
+ ++coeff;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+// second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// The order of the output coefficients of the Hadamard transform is not
+// important. For optimization purposes the final transpose may be skipped.
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t buffer2[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
+ ++tmp_buf;
+ }
+
+ for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
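
The dynamic-range comments follow from each butterfly stage at most doubling the magnitude, so the three stages of one 1-D pass multiply the bound by 8. Checking the quoted figures (illustrative):

    /* Residual input is [-255, 255] (9 bits signed); each 1-D pass grows the
     * bound by 8x. */
    enum {
      kMaxInput = 255,        /* 9 bit */
      kAfterPass1 = 8 * 255,  /* 2040: 12 bit, as the first-pass comment says */
      kAfterPass2 = 8 * 2040  /* 16320: 15 bit, fits the int16_t buffer2 */
    };
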
+
+// In-place 16x16 2D Hadamard transform
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ // coeff: 15 bit, dynamic range [-16320, 16320]
+ for (idx = 0; idx < 64; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
+
+ tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ tran_low_t b3 = (a2 - a3) >> 1;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+ }
+
+ // coeff: 15 bit, dynamic range [-16320, 16320]
+ for (idx = 0; idx < 256; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[256];
+ tran_low_t a2 = coeff[512];
+ tran_low_t a3 = coeff[768];
+
+ tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320]
+ tran_low_t b3 = (a2 - a3) >> 2;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[256] = b1 + b3;
+ coeff[512] = b0 - b2;
+ coeff[768] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// coeff: dynamic range 20 bit.
+// length: value range {16, 64, 256, 1024}.
+int vpx_highbd_satd_c(const tran_low_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 30 bits
+ return satd;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_c(const tran_low_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ return satd;
+}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64}.
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ const int norm_factor = height >> 1;
+ assert(height >= 2);
+ for (idx = 0; idx < 16; ++idx) {
+ int i;
+ hbuf[idx] = 0;
+ // hbuf[idx]: 14 bit, dynamic range [0, 16320].
+ for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ // hbuf[idx]: 9 bit, dynamic range [0, 510].
+ hbuf[idx] /= norm_factor;
+ ++ref;
+ }
+}
+
+// width: value range {16, 32, 64}.
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
+ int idx;
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 16320]
+ for (idx = 0; idx < width; ++idx) sum += ref[idx];
+ return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits.
+ mean += diff; // mean: dynamic range 16 bits.
+ sse += diff * diff; // sse: dynamic range 26 bits.
+ }
+
+ // (mean * mean): dynamic range 31 bits.
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
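
The final subtraction uses the identity sum((d_i - mean)^2) = sse - sum^2 / n with n = width = 4 << bwl, which is why the shift is bwl + 2. A toy check with bwl = 0 (width 4, below the real {2, 3, 4} range, but the algebra is identical):

    #include <assert.h>
    #include <stdint.h>

    static void vector_var_toy_check(void) {
      const int16_t ref[4] = { 3, 3, 3, 3 }, src[4] = { 1, 2, 3, 4 };
      int sse = 0, sum = 0, i;
      for (i = 0; i < 4; ++i) {
        const int diff = ref[i] - src[i]; /* {2, 1, 0, -1} */
        sum += diff;
        sse += diff * diff;
      }
      /* 6 - ((2 * 2) >> 2) == 5, the unnormalized variance of the diffs. */
      assert(sse - ((sum * sum) >> 2) == 5);
    }
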
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return (sum + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return (sum + 8) >> 4;
+}
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ int i, j;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+ *min = 65535;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.c b/media/libvpx/libvpx/vpx_dsp/bitreader.c
new file mode 100644
index 0000000000..90cbbba53f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/endian_inl.h"
+
+int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
+ vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
+ if (size && !buffer) {
+ return 1;
+ } else {
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ r->value = 0;
+ r->count = -8;
+ r->range = 255;
+ r->decrypt_cb = decrypt_cb;
+ r->decrypt_state = decrypt_state;
+ vpx_reader_fill(r);
+ return vpx_read_bit(r) != 0; // marker bit
+ }
+}
+
+void vpx_reader_fill(vpx_reader *r) {
+ const uint8_t *const buffer_end = r->buffer_end;
+ const uint8_t *buffer = r->buffer;
+ const uint8_t *buffer_start = buffer;
+ BD_VALUE value = r->value;
+ int count = r->count;
+ const size_t bytes_left = buffer_end - buffer;
+ const size_t bits_left = bytes_left * CHAR_BIT;
+ int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+
+ if (r->decrypt_cb) {
+ size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left);
+ r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
+ buffer = r->clear_buffer;
+ buffer_start = r->clear_buffer;
+ }
+ if (bits_left > BD_VALUE_SIZE) {
+ const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+ BD_VALUE nv;
+ BD_VALUE big_endian_values;
+ memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+ big_endian_values = HToBE64(big_endian_values);
+#else
+ big_endian_values = HToBE32(big_endian_values);
+#endif
+ nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+ count += bits;
+ buffer += (bits >> 3);
+ value = r->value | (nv << (shift & 0x7));
+ } else {
+ const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
+ int loop_end = 0;
+ if (bits_over >= 0) {
+ count += LOTS_OF_BITS;
+ loop_end = bits_over;
+ }
+
+ if (bits_over < 0 || bits_left) {
+ while (shift >= loop_end) {
+ count += CHAR_BIT;
+ value |= (BD_VALUE)*buffer++ << shift;
+ shift -= CHAR_BIT;
+ }
+ }
+ }
+
+ // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
+ // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
+ // assign 'buffer' to 'r->buffer'.
+ r->buffer += buffer - buffer_start;
+ r->value = value;
+ r->count = count;
+}
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
+ // Find the end of the coded buffer
+ while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
+ r->count -= CHAR_BIT;
+ r->buffer--;
+ }
+ return r->buffer;
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.h b/media/libvpx/libvpx/vpx_dsp/bitreader.h
new file mode 100644
index 0000000000..a5927ea2ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_BITREADER_H_
+#define VPX_VPX_DSP_BITREADER_H_
+
+#include <stddef.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
+typedef struct {
+ // Be careful when reordering this struct, it may impact the cache negatively.
+ BD_VALUE value;
+ unsigned int range;
+ int count;
+ const uint8_t *buffer_end;
+ const uint8_t *buffer;
+ vpx_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+ uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+} vpx_reader;
+
+int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
+ vpx_decrypt_cb decrypt_cb, void *decrypt_state);
+
+void vpx_reader_fill(vpx_reader *r);
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
+
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
+ // Check if we have reached the end of the buffer.
+ //
+ // Variable 'count' stores the number of bits in the 'value' buffer, minus
+ // 8. The top byte is part of the algorithm, and the remainder is buffered
+ // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ // occupied, 8 for the algorithm and 8 in the buffer.
+ //
+ // When reading a byte from the user's buffer, count is filled with 8 and
+ // one byte is filled into the value buffer. When we reach the end of the
+ // data, count is additionally filled with LOTS_OF_BITS. So when
+ // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
+ //
+  // Returns 1 if we have tried to decode bits after the end of the stream
+  // was encountered, and 0 if no error has occurred.
+ return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
+}
+
+static INLINE int vpx_read(vpx_reader *r, int prob) {
+ unsigned int bit = 0;
+ BD_VALUE value;
+ BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+ unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
+
+ if (r->count < 0) vpx_reader_fill(r);
+
+ value = r->value;
+ count = r->count;
+
+ bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+
+ range = split;
+
+ if (value >= bigsplit) {
+ range = r->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
+ {
+ const unsigned char shift = vpx_norm[(unsigned char)range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ r->value = value;
+ r->count = count;
+ r->range = range;
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ int ref_result, ref_prob;
+ bitstream_queue_pop(&ref_result, &ref_prob);
+ if ((int)bit != ref_result) {
+ fprintf(stderr,
+ "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d "
+ "queue_r %d\n",
+ frame_idx, bit, ref_result, queue_r);
+
+ assert(0);
+ }
+ if (prob != ref_prob) {
+ fprintf(stderr,
+ "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d "
+ "queue_r %d\n",
+ frame_idx, prob, ref_prob, queue_r);
+
+ assert(0);
+ }
+ }
+#endif
+
+ return bit;
+}
+
+static INLINE int vpx_read_bit(vpx_reader *r) {
+ return vpx_read(r, 128); // vpx_prob_half
+}
+
+static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
+ int literal = 0, bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit;
+
+ return literal;
+}
+
+static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
+ const vpx_prob *probs) {
+ vpx_tree_index i = 0;
+
+ while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue;
+
+ return -i;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_BITREADER_H_
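
A minimal decode-side sketch of this API, assuming a well-formed buffer from the matching boolean encoder (the names and values below are illustrative):

    #include "vpx_dsp/bitreader.h"

    static int read_example(const uint8_t *buf, size_t size) {
      vpx_reader r;
      /* Nonzero return means failure (NULL buffer or bad marker bit). */
      if (vpx_reader_init(&r, buf, size, NULL, NULL)) return -1;
      {
        const int flag = vpx_read_bit(&r);        /* p = 128/256 */
        const int mode = vpx_read_literal(&r, 3); /* three raw bits */
        const int coin = vpx_read(&r, 192);       /* biased bool, p(0) = 192/256 */
        if (vpx_reader_has_error(&r)) return -1;
        return (flag << 4) | (mode << 1) | coin;
      }
    }
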
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c
new file mode 100644
index 0000000000..f59f1f7cb9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "./bitreader_buffer.h"
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
+ return (rb->bit_offset + 7) >> 3;
+}
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
+ const size_t off = rb->bit_offset;
+ const size_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
+ rb->bit_offset = off + 1;
+ return bit;
+ } else {
+ if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data);
+ return 0;
+ }
+}
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
+ int value = 0, bit;
+ for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit;
+ return value;
+}
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
+ const int value = vpx_rb_read_literal(rb, bits);
+ return vpx_rb_read_bit(rb) ? -value : value;
+}
+
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
+ return vpx_rb_read_signed_literal(rb, bits);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h
new file mode 100644
index 0000000000..b27703a4db
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+ const uint8_t *bit_buffer;
+ const uint8_t *bit_buffer_end;
+ size_t bit_offset;
+
+ void *error_handler_data;
+ vpx_rb_error_handler error_handler;
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_BITREADER_BUFFER_H_
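
The raw bit buffer has no arithmetic coder behind it; bits are packed MSB-first. A usage sketch with the struct fields in their declared order (illustrative):

    #include "vpx_dsp/bitreader_buffer.h"

    static int parse_bits_example(const uint8_t *data, size_t size) {
      struct vpx_read_bit_buffer rb = { data, data + size, 0, NULL, NULL };
      const int profile = vpx_rb_read_literal(&rb, 2);      /* two MSB-first bits */
      const int delta = vpx_rb_read_signed_literal(&rb, 4); /* magnitude, then sign */
      return profile + delta;
    }
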
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.c b/media/libvpx/libvpx/vpx_dsp/bitwriter.c
new file mode 100644
index 0000000000..5b41aa54dd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./bitwriter.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source) {
+ br->lowvalue = 0;
+ br->range = 255;
+ br->count = -24;
+ br->buffer = source;
+ br->pos = 0;
+ vpx_write_bit(br, 0);
+}
+
+void vpx_stop_encode(vpx_writer *br) {
+ int i;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(1);
+#endif
+ for (i = 0; i < 32; i++) vpx_write_bit(br, 0);
+
+  // Ensure there's no ambiguous collision with any index marker bytes
+ if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(0);
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.h b/media/libvpx/libvpx/vpx_dsp/bitwriter.h
new file mode 100644
index 0000000000..5f1ee69ec2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_BITWRITER_H_
+#define VPX_VPX_DSP_BITWRITER_H_
+
+#include <stdio.h>
+
+#include "vpx_ports/compiler_attributes.h"
+#include "vpx_ports/mem.h"
+
+#include "vpx_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vpx_writer {
+ unsigned int lowvalue;
+ unsigned int range;
+ int count;
+ unsigned int pos;
+ uint8_t *buffer;
+} vpx_writer;
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source);
+void vpx_stop_encode(vpx_writer *br);
+
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+ int bit,
+ int probability) {
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ int shift;
+
+#if CONFIG_BITSTREAM_DEBUG
+ /*
+ int queue_r = 0;
+ int frame_idx_r = 0;
+ int queue_w = bitstream_queue_get_write();
+ int frame_idx_w = bitstream_queue_get_frame_write();
+ if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ frame_idx_w, queue_w);
+ assert(0);
+ }
+ */
+ bitstream_queue_push(bit, probability);
+#endif
+
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit) {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = vpx_norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0) {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000) {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff) {
+ br->buffer[x] = 0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset)) & 0xff;
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
+static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
+ vpx_write(w, bit, 128); // vpx_prob_half
+}
+
+static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit));
+}
+
+#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_BITWRITER_H_
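
A matching encode-side sketch (the buffer is assumed large enough for the payload plus the flush bytes written by vpx_stop_encode; illustrative only):

    #include "vpx_dsp/bitwriter.h"

    static unsigned int write_example(uint8_t *buf) {
      vpx_writer w;
      vpx_start_encode(&w, buf); /* also emits the marker bit */
      vpx_write_bit(&w, 1);
      vpx_write_literal(&w, 5, 3); /* three raw bits, MSB first */
      vpx_write(&w, 0, 192);       /* biased bool, p(0) = 192/256 */
      vpx_stop_encode(&w);         /* flushes the low value */
      return w.pos;                /* bytes written */
    }
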
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c
new file mode 100644
index 0000000000..7a7e96f02e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./bitwriter_buffer.h"
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
+ return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ if (q == CHAR_BIT - 1) {
+ wb->bit_buffer[p] = bit << q;
+ } else {
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ }
+ wb->bit_offset = off + 1;
+}
+
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
+ int bits) {
+ vpx_wb_write_literal(wb, abs(data), bits);
+ vpx_wb_write_bit(wb, data < 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h
new file mode 100644
index 0000000000..3662cb64df
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_VPX_DSP_BITWRITER_BUFFER_H_
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct vpx_write_bit_buffer {
+ uint8_t *bit_buffer;
+ size_t bit_offset;
+};
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
+
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
+
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
+
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
+ int bits);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_BITWRITER_BUFFER_H_
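
Because the writer and reader buffers share the same MSB-first packing, a round trip is straightforward (illustrative check):

    #include <assert.h>
    #include "vpx_dsp/bitreader_buffer.h"
    #include "vpx_dsp/bitwriter_buffer.h"

    static void wb_rb_roundtrip(void) {
      uint8_t buf[4] = { 0 };
      struct vpx_write_bit_buffer wb = { buf, 0 };
      struct vpx_read_bit_buffer rb = { buf, buf + sizeof(buf), 0, NULL, NULL };
      vpx_wb_write_literal(&wb, 0x2a, 6);
      vpx_wb_write_inv_signed_literal(&wb, -3, 4); /* 4 magnitude bits + sign */
      assert(vpx_rb_read_literal(&rb, 6) == 0x2a);
      assert(vpx_rb_read_inv_signed_literal(&rb, 4) == -3);
    }
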
diff --git a/media/libvpx/libvpx/vpx_dsp/deblock.c b/media/libvpx/libvpx/vpx_dsp/deblock.c
new file mode 100644
index 0000000000..455b73bbce
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/deblock.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+const int16_t vpx_rv[] = {
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+ 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+ 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+ 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3,
+ 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0,
+ 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5,
+ 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7,
+ 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1,
+ 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9,
+ 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2,
+ 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6,
+ 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9,
+ 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2,
+ 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3,
+ 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7,
+ 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0,
+ 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12,
+ 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0,
+ 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12,
+ 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6,
+ 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13,
+ 9, 10, 13,
+};
+
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+ unsigned char *dst, int src_pitch,
+ int dst_pitch, int cols,
+ unsigned char *flimits, int size) {
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ unsigned char v;
+ unsigned char d[4];
+
+ assert(size >= 8);
+ assert(cols >= 8);
+
+ for (row = 0; row < size; row++) {
+ /* post_proc_down for one row */
+ p_src = src;
+ p_dst = dst;
+
+ for (col = 0; col < cols; col++) {
+ unsigned char p_above2 = p_src[col - 2 * src_pitch];
+ unsigned char p_above1 = p_src[col - src_pitch];
+ unsigned char p_below1 = p_src[col + src_pitch];
+ unsigned char p_below2 = p_src[col + 2 * src_pitch];
+
+ v = p_src[col];
+
+ if ((abs(v - p_above2) < flimits[col]) &&
+ (abs(v - p_above1) < flimits[col]) &&
+ (abs(v - p_below1) < flimits[col]) &&
+ (abs(v - p_below2) < flimits[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst;
+ p_dst = dst;
+
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+ for (col = 0; col < cols; col++) {
+ v = p_src[col];
+
+ if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+ (abs(v - p_src[col - 1]) < flimits[col]) &&
+ (abs(v - p_src[col + 1]) < flimits[col]) &&
+ (abs(v - p_src[col + 2]) < flimits[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ d[col & 3] = v;
+
+ if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
+
+ /* next row */
+ src += src_pitch;
+ dst += dst_pitch;
+ }
+}
+
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
+ int cols, int flimit) {
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+ for (r = 0; r < rows; r++) {
+ int sumsq = 16;
+ int sum = 0;
+
+ for (i = -8; i < 0; i++) s[i] = s[0];
+
+    /* Extending by 17 avoids a valgrind warning: the values at column c are
+     * buffered in d[] and only written back once we have read 8 samples ahead.
+     */
+ for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i + 8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++) {
+ int x = s[c + 7] - s[c - 8];
+ int y = s[c + 7] + s[c - 8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c & 15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[c & 15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c - 8] = d[(c - 8) & 15];
+ }
+
+ s += pitch;
+ }
+}
+
+void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int r, c, i;
+
+ for (c = 0; c < cols; c++) {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+
+ for (i = -8; i < 0; i++) s[i * pitch] = s[0];
+
+    /* Extending by 17 avoids a valgrind warning: the values at row r are
+     * buffered in d[] and only written back once we have read 8 samples ahead.
+     */
+ for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i * pitch] * s[i * pitch];
+ sum += s[i * pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++) {
+ sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+ sum += s[7 * pitch] - s[-8 * pitch];
+ d[r & 15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4;
+ }
+ if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
+ s += pitch;
+ }
+ }
+}
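Note on the deblocking kernels above: vpx_post_proc_down_and_across_mb_row_c runs the same conditional smoother first down each column and then across each row. A pixel is only touched when it differs from each of its four neighbours by less than flimits[col], and the replacement is a nest of rounded averages that works out to roughly v/2 plus one eighth of each neighbour. The vpx_mbpost_proc_* variants instead keep a sliding sum and sum of squares over a 15-sample window and apply the test sumsq * 15 - sum * sum < flimit (n * sum(x^2) - (sum x)^2, a scaled variance check) before substituting (8 + sum + s[c]) >> 4. The helper below is a hypothetical standalone restatement of the per-pixel kernel, not part of the patch:

    #include <stdlib.h>

    /* Mirrors the conditional 5-tap smoother used above: m2/m1 are the two
     * neighbours on one side of v, p1/p2 the two on the other side. */
    static unsigned char smooth5(unsigned char m2, unsigned char m1,
                                 unsigned char v, unsigned char p1,
                                 unsigned char p2, int flimit) {
      if (abs(v - m2) < flimit && abs(v - m1) < flimit && abs(v - p1) < flimit &&
          abs(v - p2) < flimit) {
        const int k1 = (m2 + m1 + 1) >> 1;         /* rounded mean of one side      */
        const int k2 = (p2 + p1 + 1) >> 1;         /* rounded mean of the other     */
        const int k3 = (k1 + k2 + 1) >> 1;         /* mean of the two side means    */
        return (unsigned char)((k3 + v + 1) >> 1); /* ~ v/2 + neighbours/8          */
      }
      return v;
    }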
diff --git a/media/libvpx/libvpx/vpx_dsp/fastssim.c b/media/libvpx/libvpx/vpx_dsp/fastssim.c
new file mode 100644
index 0000000000..4d32a02a55
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/fastssim.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#endif
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *)malloc(data_size);
+ if (!data) return -1;
+ _ctx->level = (fs_level *)data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint32_t *)data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *)data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *)data;
+ return 0;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint32_t *src1;
+ const uint32_t *src2;
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] =
+ (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1]);
+ dst2[j * w + i] =
+ (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1]);
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+ int _s1ystride, const uint8_t *_src2,
+ int _s2ystride, int _w, int _h, uint32_t bd,
+ uint32_t shift) {
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ if (bd == 8 && shift == 0) {
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
+ } else {
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
+ }
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ double ssim_c1 = SSIM_C1;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
+#else
+ assert(bit_depth == 8);
+ (void)bit_depth;
+#endif
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ int64_t mux;
+ int64_t muy;
+ int i0;
+ int i1;
+ mux = (int64_t)5 * col_sums_x[0];
+ muy = (int64_t)5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += (int)col_sums_x[i1] - (int)col_sums_x[i0];
+        muy += (int)col_sums_y[i1] - (int)col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]);
+ for (i = 0; i < w; i++)
+ col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]);
+ }
+ }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } while (0)
+
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+ uint32_t *im1;
+ uint32_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ double ssim_c2 = SSIM_C2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
+#else
+ assert(bit_depth == 8);
+ (void)bit_depth;
+#endif
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ int64_t g1;
+ int64_t g2;
+ int64_t gx;
+ int64_t gy;
+ g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]);
+ g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]);
+ g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]);
+ gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2));
+ gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx;
+ gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+  Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+  We drop the finest scale and renormalize the remaining four to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
+
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++) ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+ assert(_weight >= _ssim);
+ if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
+
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
+ _shift);
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l, _bd);
+ fs_apply_luminance(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
+ double ssimv;
+ uint32_t bd_shift = 0;
+ vpx_clear_system_state();
+ assert(bd >= in_bd);
+ bd_shift = bd - in_bd;
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, bd_shift);
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift);
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift);
+
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+ return convert_ssim_db(ssimv, 1.0);
+}
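Note on the flow above: calc_ssim builds FS_NLEVELS (4) dyadic scales of both images, evaluates the structure (gradient) term at every scale and the luminance term only at the coarsest one, and multiplies the per-scale means raised to FS_WEIGHTS[l]. vpx_calc_fastssim then mixes the three planes with fixed 0.8/0.1/0.1 weights and converts the result to decibels via convert_ssim_db. Illustrative arithmetic only, with made-up plane scores (fragment, needs <math.h>):

    double y = 0.980, u = 0.970, v = 0.970;             /* hypothetical plane scores */
    double ssimv = y * .8 + .1 * (u + v);               /* = 0.978                   */
    double db = 10 * (log10(1.0) - log10(1.0 - ssimv)); /* ~ 16.6 dB                 */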
diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c
new file mode 100644
index 0000000000..ef66de0247
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[4 * 4];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t in_high[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ } else {
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low;
+ }
+ // Transform.
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp1 = (step[0] + step[1]) * cospi_16_64;
+ temp2 = (step[0] - step[1]) * cospi_16_64;
+ out[0] = (tran_low_t)fdct_round_shift(temp1);
+ out[2] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[3] = (tran_low_t)fdct_round_shift(temp2);
+ // Do next column (which is a transposed row in second/horizontal pass)
+ ++input;
+ out += 4;
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ {
+ int i, j;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+ }
+}
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 4; ++r)
+ for (c = 0; c < 4; ++c) sum += input[r * stride + c];
+
+ output[0] = sum * 2;
+}
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *out = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Transform columns
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[2] = (tran_low_t)fdct_round_shift(t2);
+ out[4] = (tran_low_t)fdct_round_shift(t1);
+ out[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[1] = (tran_low_t)fdct_round_shift(t0);
+ out[3] = (tran_low_t)fdct_round_shift(t2);
+ out[5] = (tran_low_t)fdct_round_shift(t1);
+ out[7] = (tran_low_t)fdct_round_shift(t3);
+ out += 8;
+ }
+ in = intermediate;
+ out = output;
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
+ }
+}
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 8; ++r)
+ for (c = 0; c < 8; ++c) sum += input[r * stride + c];
+
+ output[0] = sum;
+}
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[256];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t in_high[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+ int i;
+ for (i = 0; i < 16; i++) {
+ if (0 == pass) {
+ // Calculate input for the first 8 results.
+ in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
+ in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
+ in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
+ in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
+ in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
+ in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
+ in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
+ in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
+ // Calculate input for the next 8 results.
+ step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
+ step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
+ step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
+ step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
+ step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
+ step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
+ step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
+ step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
+ } else {
+ // Calculate input for the first 8 results.
+ assert(in_low != NULL);
+ in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
+ in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
+ in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
+ in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
+ in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
+ in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
+ in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
+ in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
+ // Calculate input for the next 8 results.
+ step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
+ step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
+ step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
+ step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
+ step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
+ step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
+ step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
+ step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
+ in_low++;
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ // stage 1
+ s0 = in_high[0] + in_high[7];
+ s1 = in_high[1] + in_high[6];
+ s2 = in_high[2] + in_high[5];
+ s3 = in_high[3] + in_high[4];
+ s4 = in_high[3] - in_high[4];
+ s5 = in_high[2] - in_high[5];
+ s6 = in_high[1] - in_high[6];
+ s7 = in_high[0] - in_high[7];
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
+ }
+ // Do next column (which is a transposed row in second/horizontal pass)
+ input++;
+ out += 16;
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+}
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ int sum = 0;
+ for (r = 0; r < 16; ++r)
+ for (c = 0; c < 16; ++c) sum += input[r * stride + c];
+
+ output[0] = (tran_low_t)(sum >> 1);
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+ // and make the bounds consts.
+ // assert(-131072 <= rv && rv <= 131071);
+ return rv;
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+ tran_high_t rv = (input + 1 + (input < 0)) >> 2;
+ return rv;
+}
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+ tran_high_t step[32];
+ // Stage 1
+ step[0] = input[0] + input[(32 - 1)];
+ step[1] = input[1] + input[(32 - 2)];
+ step[2] = input[2] + input[(32 - 3)];
+ step[3] = input[3] + input[(32 - 4)];
+ step[4] = input[4] + input[(32 - 5)];
+ step[5] = input[5] + input[(32 - 6)];
+ step[6] = input[6] + input[(32 - 7)];
+ step[7] = input[7] + input[(32 - 8)];
+ step[8] = input[8] + input[(32 - 9)];
+ step[9] = input[9] + input[(32 - 10)];
+ step[10] = input[10] + input[(32 - 11)];
+ step[11] = input[11] + input[(32 - 12)];
+ step[12] = input[12] + input[(32 - 13)];
+ step[13] = input[13] + input[(32 - 14)];
+ step[14] = input[14] + input[(32 - 15)];
+ step[15] = input[15] + input[(32 - 16)];
+ step[16] = -input[16] + input[(32 - 17)];
+ step[17] = -input[17] + input[(32 - 18)];
+ step[18] = -input[18] + input[(32 - 19)];
+ step[19] = -input[19] + input[(32 - 20)];
+ step[20] = -input[20] + input[(32 - 21)];
+ step[21] = -input[21] + input[(32 - 22)];
+ step[22] = -input[22] + input[(32 - 23)];
+ step[23] = -input[23] + input[(32 - 24)];
+ step[24] = -input[24] + input[(32 - 25)];
+ step[25] = -input[25] + input[(32 - 26)];
+ step[26] = -input[26] + input[(32 - 27)];
+ step[27] = -input[27] + input[(32 - 28)];
+ step[28] = -input[28] + input[(32 - 29)];
+ step[29] = -input[29] + input[(32 - 30)];
+ step[30] = -input[30] + input[(32 - 31)];
+ step[31] = -input[31] + input[(32 - 32)];
+
+ // Stage 2
+ output[0] = step[0] + step[16 - 1];
+ output[1] = step[1] + step[16 - 2];
+ output[2] = step[2] + step[16 - 3];
+ output[3] = step[3] + step[16 - 4];
+ output[4] = step[4] + step[16 - 5];
+ output[5] = step[5] + step[16 - 6];
+ output[6] = step[6] + step[16 - 7];
+ output[7] = step[7] + step[16 - 8];
+ output[8] = -step[8] + step[16 - 9];
+ output[9] = -step[9] + step[16 - 10];
+ output[10] = -step[10] + step[16 - 11];
+ output[11] = -step[11] + step[16 - 12];
+ output[12] = -step[12] + step[16 - 13];
+ output[13] = -step[13] + step[16 - 14];
+ output[14] = -step[14] + step[16 - 15];
+ output[15] = -step[15] + step[16 - 16];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = step[18];
+ output[19] = step[19];
+
+ output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+ output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+ output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+ output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+ output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+ output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+ output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+ output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+ output[28] = step[28];
+ output[29] = step[29];
+ output[30] = step[30];
+ output[31] = step[31];
+
+  // drop the magnitude by a factor of 4 so the intermediate values stay
+  // within the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
+ // Stage 3
+ step[0] = output[0] + output[(8 - 1)];
+ step[1] = output[1] + output[(8 - 2)];
+ step[2] = output[2] + output[(8 - 3)];
+ step[3] = output[3] + output[(8 - 4)];
+ step[4] = -output[4] + output[(8 - 5)];
+ step[5] = -output[5] + output[(8 - 6)];
+ step[6] = -output[6] + output[(8 - 7)];
+ step[7] = -output[7] + output[(8 - 8)];
+ step[8] = output[8];
+ step[9] = output[9];
+ step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+ step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+ step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+ step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+ step[14] = output[14];
+ step[15] = output[15];
+
+ step[16] = output[16] + output[23];
+ step[17] = output[17] + output[22];
+ step[18] = output[18] + output[21];
+ step[19] = output[19] + output[20];
+ step[20] = -output[20] + output[19];
+ step[21] = -output[21] + output[18];
+ step[22] = -output[22] + output[17];
+ step[23] = -output[23] + output[16];
+ step[24] = -output[24] + output[31];
+ step[25] = -output[25] + output[30];
+ step[26] = -output[26] + output[29];
+ step[27] = -output[27] + output[28];
+ step[28] = output[28] + output[27];
+ step[29] = output[29] + output[26];
+ step[30] = output[30] + output[25];
+ step[31] = output[31] + output[24];
+
+ // Stage 4
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = -step[2] + step[1];
+ output[3] = -step[3] + step[0];
+ output[4] = step[4];
+ output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+ output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = -step[10] + step[9];
+ output[11] = -step[11] + step[8];
+ output[12] = -step[12] + step[15];
+ output[13] = -step[13] + step[14];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
+
+ output[16] = step[16];
+ output[17] = step[17];
+ output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+ output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+ output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+ output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+ output[22] = step[22];
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = step[25];
+ output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+ output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+ output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+ output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+ output[30] = step[30];
+ output[31] = step[31];
+
+ // Stage 5
+ step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+ step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+ step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+ step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+ step[4] = output[4] + output[5];
+ step[5] = -output[5] + output[4];
+ step[6] = -output[6] + output[7];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+ step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+ step[11] = output[11];
+ step[12] = output[12];
+ step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+ step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+ step[15] = output[15];
+
+ step[16] = output[16] + output[19];
+ step[17] = output[17] + output[18];
+ step[18] = -output[18] + output[17];
+ step[19] = -output[19] + output[16];
+ step[20] = -output[20] + output[23];
+ step[21] = -output[21] + output[22];
+ step[22] = output[22] + output[21];
+ step[23] = output[23] + output[20];
+ step[24] = output[24] + output[27];
+ step[25] = output[25] + output[26];
+ step[26] = -output[26] + output[25];
+ step[27] = -output[27] + output[24];
+ step[28] = -output[28] + output[31];
+ step[29] = -output[29] + output[30];
+ step[30] = output[30] + output[29];
+ step[31] = output[31] + output[28];
+
+ // Stage 6
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+ output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+ output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+ output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+ output[8] = step[8] + step[9];
+ output[9] = -step[9] + step[8];
+ output[10] = -step[10] + step[11];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = -step[13] + step[12];
+ output[14] = -step[14] + step[15];
+ output[15] = step[15] + step[14];
+
+ output[16] = step[16];
+ output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+ output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+ output[19] = step[19];
+ output[20] = step[20];
+ output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+ output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+ output[23] = step[23];
+ output[24] = step[24];
+ output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+ output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+ output[27] = step[27];
+ output[28] = step[28];
+ output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+ output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+ output[31] = step[31];
+
+ // Stage 7
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+ step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+ step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+ step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+ step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+ step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+ step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+ step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+ step[16] = output[16] + output[17];
+ step[17] = -output[17] + output[16];
+ step[18] = -output[18] + output[19];
+ step[19] = output[19] + output[18];
+ step[20] = output[20] + output[21];
+ step[21] = -output[21] + output[20];
+ step[22] = -output[22] + output[23];
+ step[23] = output[23] + output[22];
+ step[24] = output[24] + output[25];
+ step[25] = -output[25] + output[24];
+ step[26] = -output[26] + output[27];
+ step[27] = output[27] + output[26];
+ step[28] = output[28] + output[29];
+ step[29] = -output[29] + output[28];
+ step[30] = -output[30] + output[31];
+ step[31] = output[31] + output[30];
+
+  // Final stage --- output indices are bit-reversed.
+ output[0] = step[0];
+ output[16] = step[1];
+ output[8] = step[2];
+ output[24] = step[3];
+ output[4] = step[4];
+ output[20] = step[5];
+ output[12] = step[6];
+ output[28] = step[7];
+ output[2] = step[8];
+ output[18] = step[9];
+ output[10] = step[10];
+ output[26] = step[11];
+ output[6] = step[12];
+ output[22] = step[13];
+ output[14] = step[14];
+ output[30] = step[15];
+
+ output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+ output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+ output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+ output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+ output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+ output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+ output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+ output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+ output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+ output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+ output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+ output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+ output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+ output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+ output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+ output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i, j;
+ tran_high_t out[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ output[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+
+// Note that although dct_32_round is used in the vpx_fdct32 computation flow,
+// this 2-D fdct32x32 for the rate-distortion optimization loop operates
+// within 16-bit precision.
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i, j;
+ tran_high_t out[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ // TODO(cd): see quality impact of only doing
+ // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ vpx_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ int sum = 0;
+ for (r = 0; r < 32; ++r)
+ for (c = 0; c < 32; ++c) sum += input[r * stride + c];
+
+ output[0] = (tran_low_t)(sum >> 3);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct4x4_c(input, output, stride);
+}
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct8x8_c(input, output, stride);
+}
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct8x8_1_c(input, output, stride);
+}
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct16x16_c(input, output, stride);
+}
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct16x16_1_c(input, output, stride);
+}
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_c(input, output, stride);
+}
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_rd_c(input, output, stride);
+}
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_1_c(input, output, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
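Note on the transforms above: each vpx_fdctNxN_c runs the 1-D transform over the columns into an intermediate buffer (with the pass-0 input pre-scaled, by 16 for 4x4 and by 4 otherwise) and then over the transposed result, with a rounding shift between or after the passes so the output carries the scaling the corresponding inverse transform expects. The *_1_c variants produce only the DC term from a plain pixel sum, and vpx_fdct32x32_rd_c trades a little precision (half_round_shift in its second pass) to keep the rate-distortion loop arithmetic within 16 bits. A minimal call sketch, assuming tran_low_t and the prototypes come from vpx_dsp/vpx_dsp_common.h and the generated ./vpx_dsp_rtcd.h as usual:

    #include <stdio.h>
    #include "./vpx_dsp_rtcd.h"
    #include "vpx_dsp/vpx_dsp_common.h" /* tran_low_t */

    int main(void) {
      int16_t in[4 * 4];
      tran_low_t out[4 * 4];
      int i;
      for (i = 0; i < 16; ++i) in[i] = 1; /* flat block */
      vpx_fdct4x4_1_c(in, out, 4 /* stride */);
      printf("DC = %d\n", (int)out[0]);   /* sum * 2 = 32 */
      return 0;
    }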
diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h
new file mode 100644
index 0000000000..a43c8ea7f7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_FWD_TXFM_H_
+#define VPX_VPX_DSP_FWD_TXFM_H_
+
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert
+ // and make the bounds consts.
+ // assert(INT16_MIN <= rv && rv <= INT16_MAX);
+ return rv;
+}
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif // VPX_VPX_DSP_FWD_TXFM_H_
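Note on fdct_round_shift above (and dct_32_round in fwd_txfm.c): the cospi_k_64 constants used throughout come from vpx_dsp/txfm_common.h (not part of this diff) and are cos(k*pi/64) scaled by 2^DCT_CONST_BITS = 2^14, e.g. cospi_16_64 == 11585, roughly 16384 * cos(pi/4). ROUND_POWER_OF_TWO(x, 14) is (x + 8192) >> 14, which brings each butterfly product back to pixel scale. As a worked example with step[0] + step[1] == 1000: 1000 * 11585 = 11585000, and (11585000 + 8192) >> 14 = 707, i.e. 1000 * cos(pi/4) rounded to the nearest integer.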
diff --git a/media/libvpx/libvpx/vpx_dsp/intrapred.c b/media/libvpx/libvpx/vpx_dsp/intrapred.c
new file mode 100644
index 0000000000..400e632e98
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/intrapred.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define DST(x, y) dst[(x) + (y)*stride]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ (void)above;
+ // first column
+ for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // second column
+ for (r = 0; r < bs - 2; ++r)
+ dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+ dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // rest of last row
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
+
+ for (r = bs - 2; r >= 0; --r)
+ for (c = 0; c < bs - 2; ++c)
+ dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+}
+
+static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ int size;
+ (void)left;
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG2(above[c], above[c + 1]);
+ dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+ }
+ for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+ memcpy(dst + (r + 0) * stride, dst + (r >> 1), size);
+ memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+ memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
+ memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+ }
+}
+
+static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8_t above_right = above[bs - 1];
+ const uint8_t *const dst_row0 = dst;
+ int x, size;
+ (void)left;
+
+ for (x = 0; x < bs - 1; ++x) {
+ dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+ }
+ dst[bs - 1] = above_right;
+ dst += stride;
+ for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+ memcpy(dst, dst_row0 + x, size);
+ memset(dst + size, above_right, x + 1);
+ dst += stride;
+ }
+}
+
+static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+
+ // first row
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
+ dst += stride;
+
+ // second row
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ dst += stride;
+
+ // the rest of first col
+ dst[0] = AVG3(above[-1], left[0], left[1]);
+ for (r = 3; r < bs; ++r)
+ dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+ // the rest of the block
+ for (r = 2; r < bs; ++r) {
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
+ dst += stride;
+ }
+}
+
+static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+ // silence a spurious -Warray-bounds warning, possibly related to:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+ uint8_t border[69];
+#else
+ uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right
+#endif
+
+ // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
+ for (i = 0; i < bs - 2; ++i) {
+ border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+ }
+ border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+ border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+ border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+ // dst[0][2, size), i.e., remaining top border ascending
+ for (i = 0; i < bs - 2; ++i) {
+ border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+ }
+
+ for (i = 0; i < bs; ++i) {
+ memcpy(dst + i * stride, border + bs - 1 - i, bs);
+ }
+}
+
+static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ dst[0] = AVG2(above[-1], left[0]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
+ dst++;
+
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ dst[stride] = AVG3(above[-1], left[0], left[1]);
+ for (r = 2; r < bs; r++)
+ dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+ dst++;
+
+ for (c = 0; c < bs - 2; c++)
+ dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+ dst += stride;
+
+ for (r = 1; r < bs; ++r) {
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
+ dst += stride;
+ }
+}
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ int ytop_left = above[-1];
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, 128, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+
+ for (i = 0; i < bs; i++) sum += left[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+
+ for (i = 0; i < bs; i++) sum += above[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ const int count = 2 * bs;
+
+ for (i = 0; i < bs; i++) {
+ sum += above[i];
+ sum += left[i];
+ }
+
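+  // Add half the divisor before dividing so the average rounds to nearest
+  // instead of truncating.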
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ memset(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int H = above[-1];
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+
+ memset(dst + stride * 0, AVG3(H, I, J), 4);
+ memset(dst + stride * 1, AVG3(I, J, K), 4);
+ memset(dst + stride * 2, AVG3(J, K, L), 4);
+ memset(dst + stride * 3, AVG3(K, L, L), 4);
+}
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int H = above[-1];
+ const int I = above[0];
+ const int J = above[1];
+ const int K = above[2];
+ const int L = above[3];
+ const int M = above[4];
+ (void)left;
+
+ dst[0] = AVG3(H, I, J);
+ dst[1] = AVG3(I, J, K);
+ dst[2] = AVG3(J, K, L);
+ dst[3] = AVG3(K, L, M);
+ memcpy(dst + stride * 1, dst, 4);
+ memcpy(dst + stride * 2, dst, 4);
+ memcpy(dst + stride * 3, dst, 4);
+}
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ (void)above;
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ (void)left;
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(3, 2) = AVG2(E, F); // differs from vp8
+
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 3) = AVG3(E, F, G); // differs from vp8
+}
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ const int H = above[7];
+ (void)left;
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(3, 2) = AVG3(E, F, G);
+
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ const int H = above[7];
+ (void)stride;
+ (void)left;
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = H; // differs from vp8
+}
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ const int H = above[7];
+ (void)stride;
+ (void)left;
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
+}
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)stride;
+ DST(0, 3) = AVG3(J, K, L);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
+}
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)above;
+ (void)bd;
+
+ // First column.
+ for (r = 0; r < bs - 1; ++r) {
+ dst[r * stride] = AVG2(left[r], left[r + 1]);
+ }
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // Second column.
+ for (r = 0; r < bs - 2; ++r) {
+ dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+ }
+ dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // Rest of last row.
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
+
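+  // Each remaining pixel repeats the value two columns to the left in the
+  // row below, propagating the left-column averages along the diagonal.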
+ for (r = bs - 2; r >= 0; --r) {
+ for (c = 0; c < bs - 2; ++c)
+ dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+ }
+}
+
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ int size;
+ (void)left;
+ (void)bd;
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG2(above[c], above[c + 1]);
+ dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+ }
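+  // Each later pair of rows repeats the first two rows shifted left by one
+  // more pixel, right-padded with above[bs - 1].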
+ for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+ memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst));
+ vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+ memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1),
+ size * sizeof(*dst));
+ vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+ }
+}
+
+static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16_t above_right = above[bs - 1];
+ const uint16_t *const dst_row0 = dst;
+ int x, size;
+ (void)left;
+ (void)bd;
+
+ for (x = 0; x < bs - 1; ++x) {
+ dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+ }
+ dst[bs - 1] = above_right;
+ dst += stride;
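+  // Each subsequent row is the first row shifted left by one more pixel,
+  // right-padded with the above-right value.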
+ for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+ memcpy(dst, dst_row0 + x, size * sizeof(*dst));
+ vpx_memset16(dst + size, above_right, x + 1);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)bd;
+
+ // first row
+ for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
+ dst += stride;
+
+ // second row
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ dst += stride;
+
+  // the rest of the first column
+ dst[0] = AVG3(above[-1], left[0], left[1]);
+ for (r = 3; r < bs; ++r)
+ dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+ // the rest of the block
+ for (r = 2; r < bs; ++r) {
+ for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+ // silence a spurious -Warray-bounds warning, possibly related to:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+ uint16_t border[69];
+#else
+ uint16_t border[32 + 32 - 1]; // outer border from bottom-left to top-right
+#endif
+ (void)bd;
+
+  // border[0 .. bs-3]: filtered left column, starting at the bottom-left.
+ for (i = 0; i < bs - 2; ++i) {
+ border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+ }
+ border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+ border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+ border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+  // border[bs+1 .. 2*bs-2]: filtered top row, ascending to the top-right.
+ for (i = 0; i < bs - 2; ++i) {
+ border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+ }
+
+ for (i = 0; i < bs; ++i) {
+ memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0]));
+ }
+}
+
+static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)bd;
+ dst[0] = AVG2(above[-1], left[0]);
+ for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
+ dst++;
+
+ dst[0] = AVG3(left[0], above[-1], above[0]);
+ dst[stride] = AVG3(above[-1], left[0], left[1]);
+ for (r = 2; r < bs; r++)
+ dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+ dst++;
+
+ for (c = 0; c < bs - 2; c++)
+ dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+ dst += stride;
+
+ for (r = 1; r < bs; ++r) {
+ for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ vpx_memset16(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ int ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset16(dst, 128 << (bd - 8), bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) sum += left[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) sum += above[i];
+ expected_dc = (sum + (bs >> 1)) / bs;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ const int count = 2 * bs;
+ (void)bd;
+
+ for (i = 0; i < bs; i++) {
+ sum += above[i];
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset16(dst, expected_dc, bs);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ (void)above;
+ (void)bd;
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ (void)left;
+ (void)bd;
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(3, 2) = AVG2(E, F); // differs from vp8
+
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 3) = AVG3(E, F, G); // differs from vp8
+}
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ const int E = above[4];
+ const int F = above[5];
+ const int G = above[6];
+ const int H = above[7];
+ (void)left;
+ (void)bd;
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = H; // differs from vp8
+}
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)bd;
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ const int D = above[3];
+ (void)bd;
+ DST(0, 3) = AVG3(J, K, L);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
+}
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const int X = above[-1];
+ const int A = above[0];
+ const int B = above[1];
+ const int C = above[2];
+ (void)bd;
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// These macros generate wrapper functions so that all the prediction
+// functions can be unified and accessed through a pointer array. Note that
+// the above and left boundaries are not necessarily used all the time.
+#define intra_pred_sized(type, size) \
+ void vpx_##type##_predictor_##size##x##size##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, size, above, left); \
+ }
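+// For example, intra_pred_sized(v, 8) expands to the thin wrapper
+//   void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
+//                              const uint8_t *above, const uint8_t *left) {
+//     v_predictor(dst, stride, 8, above, left);
+//   }
+// so each size-specific entry point simply forwards to the generic,
+// size-parameterized predictor above.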
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define intra_pred_highbd_sized(type, size) \
+ void vpx_highbd_##type##_predictor_##size##x##size##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ highbd_##type##_predictor(dst, stride, size, above, left, bd); \
+ }
+
+/* clang-format off */
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
+
+#else
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+intra_pred_no_4x4(d207)
+intra_pred_no_4x4(d63)
+intra_pred_no_4x4(d45)
+intra_pred_no_4x4(d117)
+intra_pred_no_4x4(d135)
+intra_pred_no_4x4(d153)
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(tm)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+/* clang-format on */
+#undef intra_pred_allsizes
diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c
new file mode 100644
index 0000000000..97655b3a9e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c
@@ -0,0 +1,2701 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
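+  /* Each 1-D 4-point pass below uses 7 additions/subtractions and a single
+     shift (e1 = (a1 - d1) >> 1); over 4 rows plus 4 columns that is 56 adds
+     and 8 shifts for the 16 pixels, i.e. the figure quoted above. */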
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1);
+ op[1] = WRAPLOW(b1);
+ op[2] = WRAPLOW(c1);
+ op[3] = WRAPLOW(d1);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
+
+ ip++;
+ dest++;
+ }
+}
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = input;
+ tran_low_t *op = tmp;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1);
+ op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
+ ip++;
+ dest++;
+ }
+}
+
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ // 32-bit result is enough for the following multiplications.
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = WRAPLOW(x0 - x2 + x3);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
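+  // dct_const_round_shift() rounds away the 14 fractional bits introduced by
+  // the multiplication scaling, which is what reduces 29b to the 15b output.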
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
+}
+
+void idct4_c(const tran_low_t *input, tran_low_t *output) {
+ int16_t step[4];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
+ temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
+ temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3]);
+ output[1] = WRAPLOW(step[1] + step[2]);
+ output[2] = WRAPLOW(step[1] - step[2]);
+ output[3] = WRAPLOW(step[0] - step[3]);
+}
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ idct4_c(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ idct4_c(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 4));
+ }
+ }
+}
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i;
+ tran_high_t a1;
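+  // The lone DC coefficient is scaled by cospi_16_64 twice, standing in for
+  // the row and column passes; after a single rounding shift the result a1
+  // is added to every pixel of the 4x4 block.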
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_add(dest[0], a1);
+ dest[1] = clip_pixel_add(dest[1], a1);
+ dest[2] = clip_pixel_add(dest[2], a1);
+ dest[3] = clip_pixel_add(dest[3], a1);
+ dest += stride;
+ }
+}
+
+void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+ // stage 2
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+ // stage 3
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
+}
+
+void idct8_c(const tran_low_t *input, tran_low_t *output) {
+ int16_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = (int16_t)input[0];
+ step1[2] = (int16_t)input[4];
+ step1[1] = (int16_t)input[2];
+ step1[3] = (int16_t)input[6];
+ temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
+ temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
+ temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ temp1 = (step1[0] + step1[2]) * cospi_16_64;
+ temp2 = (step1[0] - step1[2]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ // stage 3
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7]);
+ output[1] = WRAPLOW(step1[1] + step1[6]);
+ output[2] = WRAPLOW(step1[2] + step1[5]);
+ output[3] = WRAPLOW(step1[3] + step1[4]);
+ output[4] = WRAPLOW(step1[3] - step1[4]);
+ output[5] = WRAPLOW(step1[2] - step1[5]);
+ output[6] = WRAPLOW(step1[1] - step1[6]);
+ output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ for (i = 0; i < 8; ++i) {
+ idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+  // Only the first 4 rows have non-zero coefficients
+ for (i = 0; i < 4; ++i) {
+ idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
+}
+
+void idct16_c(const tran_low_t *input, tran_low_t *output) {
+ int16_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = (int16_t)input[0 / 2];
+ step1[1] = (int16_t)input[16 / 2];
+ step1[2] = (int16_t)input[8 / 2];
+ step1[3] = (int16_t)input[24 / 2];
+ step1[4] = (int16_t)input[4 / 2];
+ step1[5] = (int16_t)input[20 / 2];
+ step1[6] = (int16_t)input[12 / 2];
+ step1[7] = (int16_t)input[28 / 2];
+ step1[8] = (int16_t)input[2 / 2];
+ step1[9] = (int16_t)input[18 / 2];
+ step1[10] = (int16_t)input[10 / 2];
+ step1[11] = (int16_t)input[26 / 2];
+ step1[12] = (int16_t)input[6 / 2];
+ step1[13] = (int16_t)input[22 / 2];
+ step1[14] = (int16_t)input[14 / 2];
+ step1[15] = (int16_t)input[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]);
+ output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]);
+ output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]);
+ output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]);
+ output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]);
+ output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]);
+ output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]);
+ output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]);
+ output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]);
+ output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]);
+ output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]);
+ output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]);
+ output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]);
+ output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]);
+ output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]);
+ output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]);
+}
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 8x8 area, we only need to calculate the first 8 rows here.
+ for (i = 0; i < 8; ++i) {
+ idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+void idct32_c(const tran_low_t *input, tran_low_t *output) {
+ int16_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+
+ // stage 1
+ step1[0] = (int16_t)input[0];
+ step1[1] = (int16_t)input[16];
+ step1[2] = (int16_t)input[8];
+ step1[3] = (int16_t)input[24];
+ step1[4] = (int16_t)input[4];
+ step1[5] = (int16_t)input[20];
+ step1[6] = (int16_t)input[12];
+ step1[7] = (int16_t)input[28];
+ step1[8] = (int16_t)input[2];
+ step1[9] = (int16_t)input[18];
+ step1[10] = (int16_t)input[10];
+ step1[11] = (int16_t)input[26];
+ step1[12] = (int16_t)input[6];
+ step1[13] = (int16_t)input[22];
+ step1[14] = (int16_t)input[14];
+ step1[15] = (int16_t)input[30];
+
+ temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
+ temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
+ temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
+ temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
+ temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
+ temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
+ temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
+ temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
+ temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step2[16] = WRAPLOW(step1[16] + step1[17]);
+ step2[17] = WRAPLOW(step1[16] - step1[17]);
+ step2[18] = WRAPLOW(-step1[18] + step1[19]);
+ step2[19] = WRAPLOW(step1[18] + step1[19]);
+ step2[20] = WRAPLOW(step1[20] + step1[21]);
+ step2[21] = WRAPLOW(step1[20] - step1[21]);
+ step2[22] = WRAPLOW(-step1[22] + step1[23]);
+ step2[23] = WRAPLOW(step1[22] + step1[23]);
+ step2[24] = WRAPLOW(step1[24] + step1[25]);
+ step2[25] = WRAPLOW(step1[24] - step1[25]);
+ step2[26] = WRAPLOW(-step1[26] + step1[27]);
+ step2[27] = WRAPLOW(step1[26] + step1[27]);
+ step2[28] = WRAPLOW(step1[28] + step1[29]);
+ step2[29] = WRAPLOW(step1[28] - step1[29]);
+ step2[30] = WRAPLOW(-step1[30] + step1[31]);
+ step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19]);
+ step2[17] = WRAPLOW(step1[17] + step1[18]);
+ step2[18] = WRAPLOW(step1[17] - step1[18]);
+ step2[19] = WRAPLOW(step1[16] - step1[19]);
+ step2[20] = WRAPLOW(-step1[20] + step1[23]);
+ step2[21] = WRAPLOW(-step1[21] + step1[22]);
+ step2[22] = WRAPLOW(step1[21] + step1[22]);
+ step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27]);
+ step2[25] = WRAPLOW(step1[25] + step1[26]);
+ step2[26] = WRAPLOW(step1[25] - step1[26]);
+ step2[27] = WRAPLOW(step1[24] - step1[27]);
+ step2[28] = WRAPLOW(-step1[28] + step1[31]);
+ step2[29] = WRAPLOW(-step1[29] + step1[30]);
+ step2[30] = WRAPLOW(step1[29] + step1[30]);
+ step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = WRAPLOW(step1[16] + step1[23]);
+ step2[17] = WRAPLOW(step1[17] + step1[22]);
+ step2[18] = WRAPLOW(step1[18] + step1[21]);
+ step2[19] = WRAPLOW(step1[19] + step1[20]);
+ step2[20] = WRAPLOW(step1[19] - step1[20]);
+ step2[21] = WRAPLOW(step1[18] - step1[21]);
+ step2[22] = WRAPLOW(step1[17] - step1[22]);
+ step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31]);
+ step2[25] = WRAPLOW(-step1[25] + step1[30]);
+ step2[26] = WRAPLOW(-step1[26] + step1[29]);
+ step2[27] = WRAPLOW(-step1[27] + step1[28]);
+ step2[28] = WRAPLOW(step1[27] + step1[28]);
+ step2[29] = WRAPLOW(step1[26] + step1[29]);
+ step2[30] = WRAPLOW(step1[25] + step1[30]);
+ step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15]);
+ step1[1] = WRAPLOW(step2[1] + step2[14]);
+ step1[2] = WRAPLOW(step2[2] + step2[13]);
+ step1[3] = WRAPLOW(step2[3] + step2[12]);
+ step1[4] = WRAPLOW(step2[4] + step2[11]);
+ step1[5] = WRAPLOW(step2[5] + step2[10]);
+ step1[6] = WRAPLOW(step2[6] + step2[9]);
+ step1[7] = WRAPLOW(step2[7] + step2[8]);
+ step1[8] = WRAPLOW(step2[7] - step2[8]);
+ step1[9] = WRAPLOW(step2[6] - step2[9]);
+ step1[10] = WRAPLOW(step2[5] - step2[10]);
+ step1[11] = WRAPLOW(step2[4] - step2[11]);
+ step1[12] = WRAPLOW(step2[3] - step2[12]);
+ step1[13] = WRAPLOW(step2[2] - step2[13]);
+ step1[14] = WRAPLOW(step2[1] - step2[14]);
+ step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31]);
+ output[1] = WRAPLOW(step1[1] + step1[30]);
+ output[2] = WRAPLOW(step1[2] + step1[29]);
+ output[3] = WRAPLOW(step1[3] + step1[28]);
+ output[4] = WRAPLOW(step1[4] + step1[27]);
+ output[5] = WRAPLOW(step1[5] + step1[26]);
+ output[6] = WRAPLOW(step1[6] + step1[25]);
+ output[7] = WRAPLOW(step1[7] + step1[24]);
+ output[8] = WRAPLOW(step1[8] + step1[23]);
+ output[9] = WRAPLOW(step1[9] + step1[22]);
+ output[10] = WRAPLOW(step1[10] + step1[21]);
+ output[11] = WRAPLOW(step1[11] + step1[20]);
+ output[12] = WRAPLOW(step1[12] + step1[19]);
+ output[13] = WRAPLOW(step1[13] + step1[18]);
+ output[14] = WRAPLOW(step1[14] + step1[17]);
+ output[15] = WRAPLOW(step1[15] + step1[16]);
+ output[16] = WRAPLOW(step1[15] - step1[16]);
+ output[17] = WRAPLOW(step1[14] - step1[17]);
+ output[18] = WRAPLOW(step1[13] - step1[18]);
+ output[19] = WRAPLOW(step1[12] - step1[19]);
+ output[20] = WRAPLOW(step1[11] - step1[20]);
+ output[21] = WRAPLOW(step1[10] - step1[21]);
+ output[22] = WRAPLOW(step1[9] - step1[22]);
+ output[23] = WRAPLOW(step1[8] - step1[23]);
+ output[24] = WRAPLOW(step1[7] - step1[24]);
+ output[25] = WRAPLOW(step1[6] - step1[25]);
+ output[26] = WRAPLOW(step1[5] - step1[26]);
+ output[27] = WRAPLOW(step1[4] - step1[27]);
+ output[28] = WRAPLOW(step1[3] - step1[28]);
+ output[29] = WRAPLOW(step1[2] - step1[29]);
+ output[30] = WRAPLOW(step1[1] - step1[30]);
+ output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff = 0;
+ for (j = 0; j < 32; ++j) zero_coeff |= input[j];
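+ // An all-zero row yields an all-zero 1-D output, so the transform is
+ // skipped for it (valid 8-bit-path coefficients fit the int16_t
+ // accumulator).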
+
+ if (zero_coeff)
+ idct32_c(input, outptr);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // Only the upper-left 16x16 block has non-zero coeffs (eob <= 135)
+ for (i = 0; i < 16; ++i) {
+ idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // Only the upper-left 8x8 block has non-zero coeffs (eob <= 34)
+ for (i = 0; i < 8; ++i) {
+ idct32_c(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ idct32_c(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
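+ // DC-only path: scale input[0] by cospi_16_64 once per pass (rows, then
+ // columns) and apply the final 6-bit rounding shift.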
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+ dest += stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
+// transform amplify bits + 1 bit for contingency in rounding and quantizing
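+// (i.e. 12 + 7 + 5 + 1 = 25 bits, matching the 1 << 25 bound below)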
+#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
+
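+// Returns 1 if any coefficient magnitude falls outside the valid range;
+// callers then zero their output and return early, keeping out-of-range
+// coefficients from a corrupt bitstream out of the arithmetic below.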
+static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
+ int size) {
+ int i;
+ for (i = 0; i < size; ++i)
+ if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
+ return 0;
+}
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
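+ // Counting ops in the loops below: each 1-D pass spends 7 add/subs and
+ // 1 shift per group of 4 samples, so the two passes over a 4x4 block
+ // total 56 adds and 8 shifts for 16 pixels, i.e. the 3.5 adds and
+ // 0.5 shifts per pixel quoted above.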
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = HIGHBD_WRAPLOW(b1, bd);
+ op[2] = HIGHBD_WRAPLOW(c1, bd);
+ op[3] = HIGHBD_WRAPLOW(d1, bd);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] =
+ highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
+ dest[stride * 1] =
+ highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
+ dest[stride * 2] =
+ highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
+ dest[stride * 3] =
+ highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = input;
+ tran_low_t *op = tmp;
+ (void)bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 4)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 4);
+ return;
+ }
+
+ if (!(x0 | x1 | x2 | x3)) {
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = (tran_high_t)sinpi_1_9 * x0;
+ s1 = (tran_high_t)sinpi_2_9 * x0;
+ s2 = (tran_high_t)sinpi_3_9 * x1;
+ s3 = (tran_high_t)sinpi_4_9 * x2;
+ s4 = (tran_high_t)sinpi_1_9 * x2;
+ s5 = (tran_high_t)sinpi_2_9 * x3;
+ s6 = (tran_high_t)sinpi_4_9 * x3;
+ s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
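+ // dct_const_round_shift() then rounds off DCT_CONST_BITS (14):
+ // 29 - 14 = 15 bits.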
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+ output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+ output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
+}
+
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 4)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 4);
+ return;
+ }
+
+ // stage 1
+ temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
+ temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
+ step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
+ step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ // stage 2
+ output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+ output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+ output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+ output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
+}
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ vpx_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
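+ // The final rounding shift scales with transform size: 4 here for 4x4,
+ // 5 for 8x8, 6 for 16x16 and 32x32.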
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+ dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+ dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+ dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+ dest += stride;
+ }
+}
+
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[7];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 8)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 8);
+ return;
+ }
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
+ s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
+ s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
+ s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
+ s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
+ s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
+ s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
+ s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
+
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
+ s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
+ s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
+ s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
+
+ x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+ x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+ x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+
+ // stage 3
+ s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
+ s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+ s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+ s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
+
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+
+ output[0] = HIGHBD_WRAPLOW(x0, bd);
+ output[1] = HIGHBD_WRAPLOW(-x4, bd);
+ output[2] = HIGHBD_WRAPLOW(x6, bd);
+ output[3] = HIGHBD_WRAPLOW(-x2, bd);
+ output[4] = HIGHBD_WRAPLOW(x3, bd);
+ output[5] = HIGHBD_WRAPLOW(-x7, bd);
+ output[6] = HIGHBD_WRAPLOW(x5, bd);
+ output[7] = HIGHBD_WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+
+ if (detect_invalid_highbd_input(input, 8)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 8);
+ return;
+ }
+
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 =
+ input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
+ temp2 =
+ input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ // stage 2 & stage 3 - even half
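+ // (the even-indexed inputs, step1[0..3], form a 4-point idct, computed
+ // in place)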
+ vpx_highbd_idct4_c(step1, step1, bd);
+
+ // stage 2 - odd half
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
+
+ // stage 3 - odd half
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ for (i = 0; i < 8; ++i) {
+ vpx_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ vpx_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ // Only the first 4 rows have non-zero coefs
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ vpx_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+ tran_low_t x0 = input[15];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[13];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[11];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[9];
+ tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 16)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 16);
+ return;
+ }
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
+ s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
+ s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
+ s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
+ s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
+ s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
+ s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
+ s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
+ s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
+ s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
+ s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
+ s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
+ s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
+ s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
+ s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
+ s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
+
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
+ s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
+ s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
+ s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
+ s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
+ s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
+ s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
+ s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
+
+ x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+ x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+ x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+ x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+ x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+ x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+ x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
+ s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
+ s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
+ s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
+ s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
+ s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
+ s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
+
+ x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+ x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+ x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+ x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+ x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+ x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+
+ // stage 4
+ s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
+ s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+ s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+ s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
+ s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
+ s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
+ s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
+ s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
+
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
+
+ output[0] = HIGHBD_WRAPLOW(x0, bd);
+ output[1] = HIGHBD_WRAPLOW(-x8, bd);
+ output[2] = HIGHBD_WRAPLOW(x12, bd);
+ output[3] = HIGHBD_WRAPLOW(-x4, bd);
+ output[4] = HIGHBD_WRAPLOW(x6, bd);
+ output[5] = HIGHBD_WRAPLOW(x14, bd);
+ output[6] = HIGHBD_WRAPLOW(x10, bd);
+ output[7] = HIGHBD_WRAPLOW(x2, bd);
+ output[8] = HIGHBD_WRAPLOW(x3, bd);
+ output[9] = HIGHBD_WRAPLOW(x11, bd);
+ output[10] = HIGHBD_WRAPLOW(x15, bd);
+ output[11] = HIGHBD_WRAPLOW(x7, bd);
+ output[12] = HIGHBD_WRAPLOW(x5, bd);
+ output[13] = HIGHBD_WRAPLOW(-x13, bd);
+ output[14] = HIGHBD_WRAPLOW(x9, bd);
+ output[15] = HIGHBD_WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 16)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 16);
+ return;
+ }
+
+ // stage 1
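+ // The numerators follow the 32-point load order (0, 16, 8, 24, ...);
+ // the / 2 divisions are folded to constants at compile time.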
+ step1[0] = input[0 / 2];
+ step1[1] = input[16 / 2];
+ step1[2] = input[8 / 2];
+ step1[3] = input[24 / 2];
+ step1[4] = input[4 / 2];
+ step1[5] = input[20 / 2];
+ step1[6] = input[12 / 2];
+ step1[7] = input[28 / 2];
+ step1[8] = input[2 / 2];
+ step1[9] = input[18 / 2];
+ step1[10] = input[10 / 2];
+ step1[11] = input[26 / 2];
+ step1[12] = input[6 / 2];
+ step1[13] = input[22 / 2];
+ step1[14] = input[14 / 2];
+ step1[15] = input[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 =
+ step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+ temp2 =
+ step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+ step1[14] * (tran_high_t)cospi_18_64;
+ temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+ step1[14] * (tran_high_t)cospi_14_64;
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+ step1[13] * (tran_high_t)cospi_10_64;
+ temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+ step1[13] * (tran_high_t)cospi_22_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+ step1[12] * (tran_high_t)cospi_26_64;
+ temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+ step1[12] * (tran_high_t)cospi_6_64;
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 =
+ step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+ temp2 =
+ step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+ step1[14] * (tran_high_t)cospi_24_64;
+ temp2 =
+ step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+ step1[13] * (tran_high_t)cospi_8_64;
+ temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+ step1[13] * (tran_high_t)cospi_24_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[7] = step2[7];
+
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
+
+ // stage 6
+ step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+ output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+ output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+ output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+ output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+ output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+ output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+ output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+ output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+ output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+ output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+ output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+ output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+ output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+ output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+ output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows. Since all non-zero dct coefficients are in the
+ // upper-left 8x8 area, only the first 8 rows need to be calculated here.
+ for (i = 0; i < 8; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ uint16_t *destT = dest;
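+ // Walk the column with a moving pointer instead of computing j * stride
+ // per sample.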
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ destT[i] = highbd_clip_pixel_add(destT[i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ destT += stride;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows. Since all non-zero dct coefficients are in the
+ // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+ (void)bd;
+
+ if (detect_invalid_highbd_input(input, 32)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 32);
+ return;
+ }
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 =
+ input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
+ temp2 =
+ input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
+ step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = input[17] * (tran_high_t)cospi_15_64 -
+ input[15] * (tran_high_t)cospi_17_64;
+ temp2 = input[17] * (tran_high_t)cospi_17_64 +
+ input[15] * (tran_high_t)cospi_15_64;
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 =
+ input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
+ temp2 =
+ input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 =
+ input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
+ temp2 =
+ input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 =
+ input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
+ temp2 =
+ input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = input[21] * (tran_high_t)cospi_11_64 -
+ input[11] * (tran_high_t)cospi_21_64;
+ temp2 = input[21] * (tran_high_t)cospi_21_64 +
+ input[11] * (tran_high_t)cospi_11_64;
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = input[13] * (tran_high_t)cospi_19_64 -
+ input[19] * (tran_high_t)cospi_13_64;
+ temp2 = input[13] * (tran_high_t)cospi_13_64 +
+ input[19] * (tran_high_t)cospi_19_64;
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 =
+ input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
+ temp2 =
+ input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 =
+ step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+ temp2 =
+ step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+ step1[14] * (tran_high_t)cospi_18_64;
+ temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+ step1[14] * (tran_high_t)cospi_14_64;
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+ step1[13] * (tran_high_t)cospi_10_64;
+ temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+ step1[13] * (tran_high_t)cospi_22_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+ step1[12] * (tran_high_t)cospi_26_64;
+ temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+ step1[12] * (tran_high_t)cospi_6_64;
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+ step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+ step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+ step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+ step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+ step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+ step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+ step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+ step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+ step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+ step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+ step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 =
+ step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+ temp2 =
+ step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+ temp2 =
+ step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
+ step2[30] * (tran_high_t)cospi_28_64;
+ temp2 = step2[17] * (tran_high_t)cospi_28_64 +
+ step2[30] * (tran_high_t)cospi_4_64;
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
+ step2[29] * (tran_high_t)cospi_4_64;
+ temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
+ step2[29] * (tran_high_t)cospi_28_64;
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
+ step2[26] * (tran_high_t)cospi_12_64;
+ temp2 = step2[21] * (tran_high_t)cospi_12_64 +
+ step2[26] * (tran_high_t)cospi_20_64;
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
+ step2[25] * (tran_high_t)cospi_20_64;
+ temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
+ step2[25] * (tran_high_t)cospi_12_64;
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 =
+ step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+ temp2 =
+ step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+ step1[14] * (tran_high_t)cospi_24_64;
+ temp2 =
+ step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+ step1[13] * (tran_high_t)cospi_8_64;
+ temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+ step1[13] * (tran_high_t)cospi_24_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+ step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+ step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+ step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+ step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
+
+ step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+ step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+ step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+ step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+ step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+ step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+ step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
+
+ // stage 5
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[7] = step2[7];
+
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
+ step2[29] * (tran_high_t)cospi_24_64;
+ temp2 = step2[18] * (tran_high_t)cospi_24_64 +
+ step2[29] * (tran_high_t)cospi_8_64;
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
+ step2[28] * (tran_high_t)cospi_24_64;
+ temp2 = step2[19] * (tran_high_t)cospi_24_64 +
+ step2[28] * (tran_high_t)cospi_8_64;
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
+ step2[27] * (tran_high_t)cospi_8_64;
+ temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
+ step2[27] * (tran_high_t)cospi_24_64;
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
+ step2[26] * (tran_high_t)cospi_8_64;
+ temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
+ step2[26] * (tran_high_t)cospi_24_64;
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+ step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+ step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+ step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+ step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
+
+ step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+ step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+ step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+ step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+ step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+ step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+ step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
+
+ // stage 7
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+ step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+ step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+ step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+ step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+ output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+ output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+ output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+ output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+ output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+ output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+ output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+ output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+ output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+ output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+ output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+ output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+ output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+ output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+ output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+ output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+ output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+ output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+ output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+ output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+ output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+ output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+ output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+ output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+ output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+ output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+ output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+ output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+ output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+ output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+ output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
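+ // OR together all 32 coefficients of the row: if the result is zero,
+ // the whole row is zero and the 1-D transform can be skipped.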
+ tran_low_t zero_coeff = 0;
+ for (j = 0; j < 32; ++j) zero_coeff |= input[j];
+
+ if (zero_coeff)
+ highbd_idct32_c(input, outptr, bd);
+ else
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // Only upper-left 16x16 has non-zero coeff
+ for (i = 0; i < 16; ++i) {
+ highbd_idct32_c(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ uint16_t *destT = dest;
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ destT[i] = highbd_clip_pixel_add(destT[i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ destT += stride;
+ }
+ }
+}
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[32 * 32] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[32], temp_out[32];
+
+ // Rows
+ // Only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32_c(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+ highbd_idct32_c(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+}
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+ int a1;
+ tran_low_t out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+ dest += stride;
+ }
+}
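+
+// Worked example (editor's illustration, assuming cospi_16_64 == 11585 and
+// DCT_CONST_BITS == 14 from txfm_common.h): for input[0] == 100,
+// dct_const_round_shift(100 * 11585) == (1158500 + 8192) >> 14 == 71; the
+// second pass gives (71 * 11585 + 8192) >> 14 == 50; and
+// ROUND_POWER_OF_TWO(50, 6) == (50 + 32) >> 6 == 1, so every pixel of the
+// 32x32 block is incremented by 1 (then clipped to [0, 2^bd - 1]).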
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h
new file mode 100644
index 0000000000..6eedbeac35
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_INV_TXFM_H_
+#define VPX_VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_high_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid VP9 input streams, intermediate stage coefficients should always
+ // stay within the range of a signed 16 bit integer. Coefficients can go out
+ // of this range for invalid/corrupt VP9 streams. However, strictly checking
+ // this range for every intermediate coefficient can be burdensome for a
+ // decoder,
+ // therefore the following assertion is only enabled when configured with
+ // --enable-coefficient-range-checking.
+ assert(INT16_MIN <= input);
+ assert(input <= INT16_MAX);
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ return input;
+}
+
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return (tran_high_t)rv;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void)int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void)bd;
+ return input;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform uses a non-normative
+// method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However, to aid hardware verification,
+// implementers can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C-implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// bd of 8 uses tran_low_t with 16 bits, so the top 16 bits are removed
+// bd of 10 uses tran_low_t with 18 bits, so the top 14 bits are removed
+// bd of 12 uses tran_low_t with 20 bits, so the top 12 bits are removed
+// bd of x uses tran_low_t with 8+x bits, so the top 24-x bits are removed
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+ ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#else // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_EMULATE_HARDWARE
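+
+// Illustration (editor's note): with CONFIG_EMULATE_HARDWARE enabled,
+// WRAPLOW(40000) == ((40000 << 16) >> 16) == -25536, i.e. the value wraps
+// exactly like a signed 16-bit integer. HIGHBD_WRAPLOW(40000, 10) shifts by
+// 24 - 10 == 14 and keeps the low 18 bits, so 40000, which fits in 18
+// signed bits, is returned unchanged.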
+
+void idct4_c(const tran_low_t *input, tran_low_t *output);
+void idct8_c(const tran_low_t *input, tran_low_t *output);
+void idct16_c(const tran_low_t *input, tran_low_t *output);
+void idct32_c(const tran_low_t *input, tran_low_t *output);
+void iadst4_c(const tran_low_t *input, tran_low_t *output);
+void iadst8_c(const tran_low_t *input, tran_low_t *output);
+void iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = HIGHBD_WRAPLOW(trans, bd);
+ return clip_pixel_highbd(dest + (int)trans, bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+ trans = WRAPLOW(trans);
+ return clip_pixel(dest + (int)trans);
+}
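+
+// Example (editor's note): clip_pixel_add(250, 20) yields clip_pixel(270)
+// == 255, and highbd_clip_pixel_add(1000, 200, 10) yields
+// clip_pixel_highbd(1200, 10) == 1023, the 10-bit maximum.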
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_INV_TXFM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 0000000000..750c9de29f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+ tran_low_t *dst) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ ptrdiff_t src_stride2 = src_stride << 1;
+ ptrdiff_t src_stride3 = src_stride2 + src_stride;
+ ptrdiff_t src_stride4 = src_stride2 << 1;
+ ptrdiff_t src_stride6 = src_stride3 << 1;
+
+ int16_t *src_tmp = (int16_t *)src;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride6);
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+ src7 = __lsx_vldx(src_tmp, src_stride6);
+
+ LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+ tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+ LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+ src4, src5, src7, src6, src3, src2);
+ LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+ tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+ LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+ tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+ LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+ src4, src5, src7, src6, src3, src2);
+ LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+ tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+ store_tran_low(tmp0, dst, 0);
+ store_tran_low(tmp1, dst, 8);
+ store_tran_low(tmp2, dst, 16);
+ store_tran_low(tmp3, dst, 24);
+ store_tran_low(tmp4, dst, 32);
+ store_tran_low(tmp5, dst, 40);
+ store_tran_low(tmp6, dst, 48);
+ store_tran_low(tmp7, dst, 56);
+}
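+
+/* Editor's note: each group of three LSX_BUTTERFLY_8_H calls implements an
+ * 8-point Hadamard transform (log2(8) == 3 butterfly stages); the
+ * LSX_TRANSPOSE8x8_H between the two groups makes the second group work
+ * along the other dimension, giving the full 2-D 8x8 transform. */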
+
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+ tran_low_t *dst) {
+ int i;
+ __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+ /* Rearrange the 16x16 block into four contiguous 8x8 transforms (an
+ * 8x32 layout, stride removed). Top left first. */
+ vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+ /* Top right. */
+ vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+ /* Bottom left. */
+ vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+ /* Bottom right. */
+ vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+ for (i = 0; i < 64; i += 8) {
+ a0 = load_tran_low(dst);
+ a1 = load_tran_low(dst + 64);
+ a2 = load_tran_low(dst + 128);
+ a3 = load_tran_low(dst + 192);
+
+ LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+ DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+ LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+ store_tran_low(a0, dst, 0);
+ store_tran_low(a1, dst, 64);
+ store_tran_low(a2, dst, 128);
+ store_tran_low(a3, dst, 192);
+
+ dst += 8;
+ }
+}
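+
+/* Scalar sketch of the combine loop above (editor's note; this mirrors the
+ * C reference vpx_hadamard_16x16_c). Per coefficient position:
+ *   b0 = (a0 + a1) >> 1;  b1 = (a0 - a1) >> 1;
+ *   b2 = (a2 + a3) >> 1;  b3 = (a2 - a3) >> 1;
+ *   dst[0] = b0 + b2;  dst[64] = b1 + b3;
+ *   dst[128] = b0 - b2;  dst[192] = b1 - b3;
+ * The >> 1 keeps the combined 16x16 result within 16-bit range. */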
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000000..482626080a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ // width > 8 || width == 8 || width == 4
+ if (width > 8) {
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ __m128i p, r, avg;
+
+ p = __lsx_vld(pred + j, 0);
+ r = __lsx_vld(ref + j, 0);
+ avg = __lsx_vavgr_bu(p, r);
+ __lsx_vst(avg, comp_pred + j, 0);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ int i = height * width;
+ do {
+ __m128i p, r, r_0, r_1;
+
+ p = __lsx_vld(pred, 0);
+ r_0 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_1 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r = __lsx_vilvl_d(r_1, r_0);
+ r = __lsx_vavgr_bu(p, r);
+
+ __lsx_vst(r, comp_pred, 0);
+
+ pred += 16;
+ comp_pred += 16;
+ i -= 16;
+ } while (i);
+ } else { // width == 4
+ int i = height * width;
+ assert(width == 4);
+ do {
+ __m128i p, r, r_0, r_1, r_2, r_3;
+ p = __lsx_vld(pred, 0);
+
+ if (width == ref_stride) {
+ r = __lsx_vld(ref, 0);
+ ref += 16;
+ } else {
+ r_0 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_1 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_2 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_3 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+ r = __lsx_vilvl_d(r_2, r_0);
+ }
+ r = __lsx_vavgr_bu(p, r);
+
+ __lsx_vst(r, comp_pred, 0);
+ comp_pred += 16;
+ pred += 16;
+ i -= 16;
+ } while (i);
+ }
+}
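+
+// Editor's note: every branch above computes the rounded average
+// (pred[i] + ref[i] + 1) >> 1 per byte via __lsx_vavgr_bu, matching the C
+// reference vpx_comp_avg_pred_c; e.g. pred == 10, ref == 13 gives
+// (10 + 13 + 1) >> 1 == 12.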
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 0000000000..b0db1e99c5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i v0_m = __lsx_vld(s, 0);
+ __m128i v1_m = __lsx_vld(s + 4, 0);
+ return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
+#else
+ return __lsx_vld(s, 0);
+#endif
+}
+
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i v0_m, v1_m;
+ v1_m = __lsx_vexth_w_h(v);
+ v0_m = __lsx_vsllwil_w_h(v, 0);
+ __lsx_vst(v0_m, s + c, 0);
+ __lsx_vst(v1_m, s + c + 4, 0);
+#else
+ __lsx_vst(v, s + c, 0);
+#endif
+}
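+
+// Editor's note: with CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits wide,
+// so eight coefficients occupy two vectors' worth of memory and must be
+// narrowed on load and sign-extended back on store; otherwise tran_low_t
+// is 16 bits and a single vector load/store suffices.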
+
+#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
new file mode 100644
index 0000000000..9bb3877212
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -0,0 +1,1176 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+#define UNPCK_SH_SW(in, out0, out1) \
+ do { \
+ out0 = __lsx_vsllwil_w_h(in, 0); \
+ out1 = __lsx_vexth_w_h(in); \
+ } while (0)
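+
+/* Editor's note: UNPCK_SH_SW sign-extends the eight int16 lanes of `in`
+ * into two int32 vectors: __lsx_vsllwil_w_h(in, 0) widens the low four
+ * lanes (a widening shift by 0) and __lsx_vexth_w_h widens the high
+ * four. */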
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+ int32_t src_stride,
+ int16_t *temp_buff) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i step0, step1, step2, step3;
+ __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+ __m128i step0_1, step1_1, step2_1, step3_1;
+
+ int32_t stride = src_stride << 1;
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ const int16_t *input_tmp = (int16_t *)input;
+
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+ in3 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in0_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+ in3_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp = input + (src_stride * 24);
+ in4_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+ in7_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+ in7 = __lsx_vldx(input_tmp, stride3);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+ in2_1, in3_1);
+ DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+ in6_1, in7_1);
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+ in7_1);
+
+ __lsx_vst(step0, temp_buff, 0);
+ __lsx_vst(step1, temp_buff, 16);
+ __lsx_vst(step2, temp_buff, 32);
+ __lsx_vst(step3, temp_buff, 48);
+
+ __lsx_vst(in4, temp_buff, 448);
+ __lsx_vst(in5, temp_buff, 464);
+ __lsx_vst(in6, temp_buff, 480);
+ __lsx_vst(in7, temp_buff, 496);
+
+ __lsx_vst(step0_1, temp_buff, 64);
+ __lsx_vst(step1_1, temp_buff, 80);
+ __lsx_vst(step2_1, temp_buff, 96);
+ __lsx_vst(step3_1, temp_buff, 112);
+
+ __lsx_vst(in4_1, temp_buff, 384);
+ __lsx_vst(in5_1, temp_buff, 400);
+ __lsx_vst(in6_1, temp_buff, 416);
+ __lsx_vst(in7_1, temp_buff, 432);
+
+ /* 3rd and 4th set */
+ input_tmp = input + (src_stride * 8);
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+ in3 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in0_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+ in3_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+ in7_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+ in7 = __lsx_vldx(input_tmp, stride3);
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+ in2_1, in3_1);
+ DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+ in6_1, in7_1);
+
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+ in7_1);
+
+ __lsx_vst(step0, temp_buff, 128);
+ __lsx_vst(step1, temp_buff, 144);
+ __lsx_vst(step2, temp_buff, 160);
+ __lsx_vst(step3, temp_buff, 176);
+
+ __lsx_vst(in4, temp_buff, 320);
+ __lsx_vst(in5, temp_buff, 336);
+ __lsx_vst(in6, temp_buff, 352);
+ __lsx_vst(in7, temp_buff, 368);
+
+ __lsx_vst(step0_1, temp_buff, 192);
+ __lsx_vst(step1_1, temp_buff, 208);
+ __lsx_vst(step2_1, temp_buff, 224);
+ __lsx_vst(step3_1, temp_buff, 240);
+
+ __lsx_vst(in4_1, temp_buff, 256);
+ __lsx_vst(in5_1, temp_buff, 272);
+ __lsx_vst(in6_1, temp_buff, 288);
+ __lsx_vst(in7_1, temp_buff, 304);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i temp0, temp1;
+
+ /* fdct even */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+ in13, in14, in15);
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1,
+ vec2, vec3, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+ in10, in11);
+ LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11);
+
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 0);
+ __lsx_vst(temp1, temp, 1024);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 512);
+ __lsx_vst(temp1, temp, 1536);
+
+ DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7,
+ vec6, vec5, vec4);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 256);
+ __lsx_vst(temp1, temp, 1792);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 1280);
+ __lsx_vst(temp1, temp, 768);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 128);
+ __lsx_vst(temp1, temp, 1920);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 1152);
+ __lsx_vst(temp1, temp, 896);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 640);
+ __lsx_vst(temp1, temp, 1408);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 384);
+ __lsx_vst(temp1, temp, 1664);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+ __m128i tmp0, tmp1;
+
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21,
+ in26, in27);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19,
+ in28, in29);
+
+ vec4 = __lsx_vsub_h(in19, in20);
+ __lsx_vst(vec4, input, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, input, 80);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, input, 160);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, input, 176);
+
+ in21 = __lsx_vadd_h(in18, in21);
+ in20 = __lsx_vadd_h(in19, in20);
+ in27 = __lsx_vadd_h(in28, in27);
+ in26 = __lsx_vadd_h(in29, in26);
+
+ DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22,
+ in23, in24, in25);
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17,
+ in30, in31);
+
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, input, 32);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, input, 48);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, input, 192);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, input, 208);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 0);
+ __lsx_vst(vec4, temp_ptr, 1920);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 896);
+ __lsx_vst(vec4, temp_ptr, 1024);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec4, temp_ptr, 1408);
+ __lsx_vst(vec5, temp_ptr, 512);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec4, temp_ptr, 384);
+ __lsx_vst(vec5, temp_ptr, 1536);
+
+ DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23,
+ in20, in21);
+ DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26,
+ in27, in24, in25);
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 1664);
+ __lsx_vst(vec4, temp_ptr, 256);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 640);
+ __lsx_vst(vec4, temp_ptr, 1280);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 1152);
+ __lsx_vst(vec4, temp_ptr, 768);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 128);
+ __lsx_vst(vec4, temp_ptr, 1792);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+ int16_t *tmp_buf, int16_t *tmp_buf_big) {
+ fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+ fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+ fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
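+
+/* Editor's note: this is the usual even/odd split of a 32-point DCT. The
+ * load/butterfly helper forms a[i] + a[31 - i] (even half) and
+ * a[i] - a[31 - i] (odd half); the even half produces the even-indexed
+ * DCT coefficients and the odd half the odd-indexed ones, so the two
+ * store helpers can run independently. */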
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+ int16_t *output) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i step0, step1, step2, step3, step4, step5, step6, step7;
+
+ DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff,
+ 192, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384,
+ temp_buff, 448, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff,
+ 240, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432,
+ temp_buff, 496, in12, in13, in14, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, step0, step1, step2, step3,
+ step4, step5, step6, step7, in8, in9, in10, in11, in12,
+ in13, in14, in15);
+
+ __lsx_vst(step0, output, 0);
+ __lsx_vst(step1, output, 16);
+ __lsx_vst(step2, output, 32);
+ __lsx_vst(step3, output, 48);
+ __lsx_vst(step4, output, 64);
+ __lsx_vst(step5, output, 80);
+ __lsx_vst(step6, output, 96);
+ __lsx_vst(step7, output, 112);
+
+ __lsx_vst(in8, output, 384);
+ __lsx_vst(in9, output, 400);
+ __lsx_vst(in10, output, 416);
+ __lsx_vst(in11, output, 432);
+ __lsx_vst(in12, output, 448);
+ __lsx_vst(in13, output, 464);
+ __lsx_vst(in14, output, 480);
+ __lsx_vst(in15, output, 496);
+
+ /* 2nd set */
+ DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff,
+ 208, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400,
+ temp_buff, 464, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff,
+ 224, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416,
+ temp_buff, 480, in12, in13, in14, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, step0, step1, step2, step3,
+ step4, step5, step6, step7, in8, in9, in10, in11, in12,
+ in13, in14, in15);
+
+ __lsx_vst(step0, output, 128);
+ __lsx_vst(step1, output, 144);
+ __lsx_vst(step2, output, 160);
+ __lsx_vst(step3, output, 176);
+ __lsx_vst(step4, output, 192);
+ __lsx_vst(step5, output, 208);
+ __lsx_vst(step6, output, 224);
+ __lsx_vst(step7, output, 240);
+
+ __lsx_vst(in8, output, 256);
+ __lsx_vst(in9, output, 272);
+ __lsx_vst(in10, output, 288);
+ __lsx_vst(in11, output, 304);
+ __lsx_vst(in12, output, 320);
+ __lsx_vst(in13, output, 336);
+ __lsx_vst(in14, output, 352);
+ __lsx_vst(in15, output, 368);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+ __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+ __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+ in13, in14, in15);
+
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+
+ __lsx_vst(vec0, interm_ptr, 0);
+ __lsx_vst(vec1, interm_ptr, 16);
+ __lsx_vst(vec2, interm_ptr, 32);
+ __lsx_vst(vec3, interm_ptr, 48);
+ __lsx_vst(vec4, interm_ptr, 64);
+ __lsx_vst(vec5, interm_ptr, 80);
+ __lsx_vst(vec6, interm_ptr, 96);
+ __lsx_vst(vec7, interm_ptr, 112);
+
+ __lsx_vst(in8, interm_ptr, 128);
+ __lsx_vst(in9, interm_ptr, 144);
+ __lsx_vst(in10, interm_ptr, 160);
+ __lsx_vst(in11, interm_ptr, 176);
+ __lsx_vst(in12, interm_ptr, 192);
+ __lsx_vst(in13, interm_ptr, 208);
+ __lsx_vst(in14, interm_ptr, 224);
+ __lsx_vst(in15, interm_ptr, 240);
+
+ /* Stage 3 */
+ UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+ UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+ UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+ UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+ UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+ UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+ UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+ UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+ DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r,
+ vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+ LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r,
+ vec5_r);
+ DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l,
+ vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
+
+ tmp3_w = __lsx_vadd_w(vec0_r, vec3_r);
+ vec0_r = __lsx_vsub_w(vec0_r, vec3_r);
+ vec3_r = __lsx_vadd_w(vec1_r, vec2_r);
+ vec1_r = __lsx_vsub_w(vec1_r, vec2_r);
+
+ DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ __lsx_vst(vec5, out, 0);
+ __lsx_vst(vec4, out, 16);
+
+ DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ __lsx_vst(vec5, out, 32);
+ __lsx_vst(vec4, out, 48);
+
+ DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32,
+ interm_ptr, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96,
+ interm_ptr, 112, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 64);
+ __lsx_vst(in5, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 80);
+ __lsx_vst(in5, out, 96);
+
+ DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160,
+ interm_ptr, 176, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224,
+ interm_ptr, 240, in12, in13, in14, in15);
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 128);
+ __lsx_vst(in5, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 144);
+ __lsx_vst(in5, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ tmp0_w = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 160);
+ __lsx_vst(in5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 192);
+ __lsx_vst(in5, out, 176);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+ in7);
+ DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+ in14, in15);
+
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 0);
+ __lsx_vst(temp1, out, 16);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 32);
+ __lsx_vst(temp1, out, 48);
+
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 64);
+ __lsx_vst(temp1, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 80);
+ __lsx_vst(temp1, out, 96);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 128);
+ __lsx_vst(temp1, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 144);
+ __lsx_vst(temp1, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 160);
+ __lsx_vst(temp1, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 192);
+ __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+ __m128i tmp0, tmp1;
+
+ in20 = __lsx_vld(temp, 64);
+ in21 = __lsx_vld(temp, 80);
+ in26 = __lsx_vld(temp, 160);
+ in27 = __lsx_vld(temp, 176);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = __lsx_vld(temp, 32);
+ in19 = __lsx_vld(temp, 48);
+ in28 = __lsx_vld(temp, 192);
+ in29 = __lsx_vld(temp, 208);
+
+ vec4 = __lsx_vsub_h(in19, in20);
+ __lsx_vst(vec4, interm_ptr, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, interm_ptr, 176);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, interm_ptr, 112);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, interm_ptr, 128);
+
+ DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+ in20, in27, in26);
+
+ in22 = __lsx_vld(temp, 96);
+ in23 = __lsx_vld(temp, 112);
+ in24 = __lsx_vld(temp, 128);
+ in25 = __lsx_vld(temp, 144);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = __lsx_vld(temp, 0);
+ in17 = __lsx_vld(temp, 16);
+ in30 = __lsx_vld(temp, 224);
+ in31 = __lsx_vld(temp, 240);
+
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, interm_ptr, 80);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, interm_ptr, 96);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, interm_ptr, 144);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 0);
+ __lsx_vst(vec4, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 224);
+ __lsx_vst(vec4, out, 16);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 32);
+ __lsx_vst(vec5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 48);
+ __lsx_vst(vec5, out, 192);
+
+ in20 = __lsx_vld(interm_ptr, 64);
+ in21 = __lsx_vld(interm_ptr, 176);
+ in27 = __lsx_vld(interm_ptr, 112);
+ in26 = __lsx_vld(interm_ptr, 128);
+
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = __lsx_vld(interm_ptr, 80);
+ in25 = __lsx_vld(interm_ptr, 96);
+ in24 = __lsx_vld(interm_ptr, 144);
+ in23 = __lsx_vld(interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 64);
+ __lsx_vst(vec4, out, 176);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 80);
+ __lsx_vst(vec4, out, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 144);
+ __lsx_vst(vec4, out, 96);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 112);
+ __lsx_vst(vec5, out, 128);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+ /* 1st set */
+ in0 = __lsx_vld(temp, 0);
+ in4 = __lsx_vld(temp, 64);
+ in2 = __lsx_vld(temp, 128);
+ in6 = __lsx_vld(temp, 192);
+ in1 = __lsx_vld(temp, 256);
+ in7 = __lsx_vld(temp, 304);
+ in3 = __lsx_vld(temp, 384);
+ in5 = __lsx_vld(temp, 432);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* 2nd set */
+ in0_1 = __lsx_vld(temp, 32);
+ in1_1 = __lsx_vld(temp, 464);
+ in2_1 = __lsx_vld(temp, 160);
+ in3_1 = __lsx_vld(temp, 336);
+ in4_1 = __lsx_vld(temp, 96);
+ in5_1 = __lsx_vld(temp, 352);
+ in6_1 = __lsx_vld(temp, 224);
+ in7_1 = __lsx_vld(temp, 480);
+
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 64);
+ __lsx_vst(in2, output, 128);
+ __lsx_vst(in3, output, 192);
+ __lsx_vst(in4, output, 256);
+ __lsx_vst(in5, output, 320);
+ __lsx_vst(in6, output, 384);
+ __lsx_vst(in7, output, 448);
+
+ LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ /* 3rd set */
+ in0 = __lsx_vld(temp, 16);
+ in1 = __lsx_vld(temp, 272);
+ in2 = __lsx_vld(temp, 144);
+ in3 = __lsx_vld(temp, 400);
+ in4 = __lsx_vld(temp, 80);
+ in5 = __lsx_vld(temp, 416);
+ in6 = __lsx_vld(temp, 208);
+ in7 = __lsx_vld(temp, 288);
+
+ __lsx_vst(in0_1, output, 16);
+ __lsx_vst(in1_1, output, 80);
+ __lsx_vst(in2_1, output, 144);
+ __lsx_vst(in3_1, output, 208);
+ __lsx_vst(in4_1, output, 272);
+ __lsx_vst(in5_1, output, 336);
+ __lsx_vst(in6_1, output, 400);
+ __lsx_vst(in7_1, output, 464);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ __lsx_vst(in0, output, 32);
+ __lsx_vst(in1, output, 96);
+ __lsx_vst(in2, output, 160);
+ __lsx_vst(in3, output, 224);
+ __lsx_vst(in4, output, 288);
+ __lsx_vst(in5, output, 352);
+ __lsx_vst(in6, output, 416);
+ __lsx_vst(in7, output, 480);
+
+ /* 4th set */
+ in0_1 = __lsx_vld(temp, 48);
+ in1_1 = __lsx_vld(temp, 448);
+ in2_1 = __lsx_vld(temp, 176);
+ in3_1 = __lsx_vld(temp, 320);
+ in4_1 = __lsx_vld(temp, 112);
+ in5_1 = __lsx_vld(temp, 368);
+ in6_1 = __lsx_vld(temp, 240);
+ in7_1 = __lsx_vld(temp, 496);
+
+ LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ __lsx_vst(in0_1, output, 48);
+ __lsx_vst(in1_1, output, 112);
+ __lsx_vst(in2_1, output, 176);
+ __lsx_vst(in3_1, output, 240);
+ __lsx_vst(in4_1, output, 304);
+ __lsx_vst(in5_1, output, 368);
+ __lsx_vst(in6_1, output, 432);
+ __lsx_vst(in7_1, output, 496);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+ fdct8x32_1d_row_even(temp_buf, temp_buf);
+ fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+ fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+ tmp_buf_big + (8 * i));
+ }
+
+ /* row transform: first 8 rows, using 32-bit intermediates */
+ fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+ /* row transform: remaining 24 rows */
+ for (i = 1; i < 4; ++i) {
+ fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+ }
+}
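+
+/* Editor's note: the column pass processes the 32x32 block as four slices
+ * of 8 columns. In the row pass, the first 8 rows go through
+ * fdct32x8_1d_row_4x, which widens part of the even path to 32 bits
+ * (FDCT32_POSTPROC_NEG_W); the remaining 24 rows use 16-bit math,
+ * presumably because the largest (low-frequency) magnitudes from the
+ * column pass sit in the top rows. */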
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+ in7);
+ DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+ in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+
+ FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+ FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+ FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+
+ temp0 = __lsx_vadd_h(in0, in3);
+ in0 = __lsx_vsub_h(in0, in3);
+ in3 = __lsx_vadd_h(in1, in2);
+ in1 = __lsx_vsub_h(in1, in2);
+
+ DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+ __lsx_vst(temp0, out, 0);
+ __lsx_vst(temp1, out, 16);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ __lsx_vst(temp0, out, 32);
+ __lsx_vst(temp1, out, 48);
+
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ __lsx_vst(temp0, out, 64);
+ __lsx_vst(temp1, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ __lsx_vst(temp0, out, 80);
+ __lsx_vst(temp1, out, 96);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ __lsx_vst(temp0, out, 128);
+ __lsx_vst(temp1, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ __lsx_vst(temp0, out, 144);
+ __lsx_vst(temp1, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ __lsx_vst(temp0, out, 160);
+ __lsx_vst(temp1, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ __lsx_vst(temp0, out, 192);
+ __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i vec4, vec5, tmp0, tmp1;
+
+ in20 = __lsx_vld(temp, 64);
+ in21 = __lsx_vld(temp, 80);
+ in26 = __lsx_vld(temp, 160);
+ in27 = __lsx_vld(temp, 176);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ FDCT_POSTPROC_2V_NEG_H(in20, in21);
+ FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+ in18 = __lsx_vld(temp, 32);
+ in19 = __lsx_vld(temp, 48);
+ in28 = __lsx_vld(temp, 192);
+ in29 = __lsx_vld(temp, 208);
+
+ FDCT_POSTPROC_2V_NEG_H(in18, in19);
+ FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+ vec4 = __lsx_vsub_h(in19, in20);
+ __lsx_vst(vec4, interm_ptr, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, interm_ptr, 176);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, interm_ptr, 128);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, interm_ptr, 112);
+
+ DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+ in20, in27, in26);
+
+ in22 = __lsx_vld(temp, 96);
+ in23 = __lsx_vld(temp, 112);
+ in24 = __lsx_vld(temp, 128);
+ in25 = __lsx_vld(temp, 144);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+ FDCT_POSTPROC_2V_NEG_H(in22, in23);
+ FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+ in16 = __lsx_vld(temp, 0);
+ in17 = __lsx_vld(temp, 16);
+ in30 = __lsx_vld(temp, 224);
+ in31 = __lsx_vld(temp, 240);
+
+ FDCT_POSTPROC_2V_NEG_H(in16, in17);
+ FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, interm_ptr, 80);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, interm_ptr, 96);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, interm_ptr, 144);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ __lsx_vst(vec5, out, 0);
+ __lsx_vst(vec4, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ __lsx_vst(vec5, out, 224);
+ __lsx_vst(vec4, out, 16);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ __lsx_vst(vec4, out, 32);
+ __lsx_vst(vec5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ __lsx_vst(vec4, out, 48);
+ __lsx_vst(vec5, out, 192);
+
+ in20 = __lsx_vld(interm_ptr, 64);
+ in21 = __lsx_vld(interm_ptr, 176);
+ in27 = __lsx_vld(interm_ptr, 112);
+ in26 = __lsx_vld(interm_ptr, 128);
+
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = __lsx_vld(interm_ptr, 80);
+ in25 = __lsx_vld(interm_ptr, 96);
+ in24 = __lsx_vld(interm_ptr, 144);
+ in23 = __lsx_vld(interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ in16 = __lsx_vadd_h(in28, in29);
+ in19 = __lsx_vadd_h(in31, in30);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ __lsx_vst(vec5, out, 64);
+ __lsx_vst(vec4, out, 176);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ __lsx_vst(vec5, out, 80);
+ __lsx_vst(vec4, out, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ __lsx_vst(vec5, out, 144);
+ __lsx_vst(vec4, out, 96);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ __lsx_vst(vec4, out, 112);
+ __lsx_vst(vec5, out, 128);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+ fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+ &tmp_buf_big[0] + (8 * i));
+ }
+ /* row transform */
+ for (i = 0; i < 4; ++i) {
+ fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+ out + (8 * i * 32));
+ }
+}
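Annotation: the "rd" variant keeps all intermediates in 16 bits by applying the FDCT_POSTPROC_2V_NEG_H rounding (see the header below) inside the row-pass helpers. For orientation, the driver's pointer arithmetic maps stripes onto the row-major 32x32 intermediate: the column pass advances by 8 columns per iteration, the row pass by 8 rows. An illustrative restatement, not part of the patch:

  #include <stdint.h>

  /* tmp_buf_big holds the 32x32 intermediate in row-major order. */
  static int16_t *column_stripe(int16_t *buf, int i) {
    return buf + 8 * i; /* stripe i covers columns 8i..8i+7 */
  }
  static int16_t *row_stripe(int16_t *buf, int i) {
    return buf + 8 * i * 32; /* stripe i covers rows 8i..8i+7 */
  }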
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000000..508532b9d8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ do { \
+ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
+ DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _t2 = __lsx_vilvl_h(_s3, _s2); \
+ _t3 = __lsx_vilvh_h(_s3, _s2); \
+ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
+ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
+ } while (0)
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+ __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+ __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int32_t src_stride8 = src_stride4 << 1;
+ int16_t *input_tmp = (int16_t *)input;
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+ in12);
+ input_tmp += src_stride4;
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+ in14);
+ input_tmp += src_stride2;
+ in15 = __lsx_vldx(input_tmp, src_stride2);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+ tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ __lsx_vst(tmp0, tmp_ptr, 0);
+ __lsx_vst(tmp1, tmp_ptr, 64);
+ __lsx_vst(tmp2, tmp_ptr, 128);
+ __lsx_vst(tmp3, tmp_ptr, 192);
+ __lsx_vst(tmp4, tmp_ptr, 256);
+ __lsx_vst(tmp5, tmp_ptr, 320);
+ __lsx_vst(tmp6, tmp_ptr, 384);
+ __lsx_vst(tmp7, tmp_ptr, 448);
+ DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+ in14, in13, in12);
+ DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+ in9, in8);
+
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __lsx_vreplvei_h(coeff, 0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+ cnst5 = __lsx_vreplvei_h(coeff, 1);
+ cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+ /* stp2 */
+ LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+ cnst0 = __lsx_vreplvei_h(coeff, 4);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+ LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ vec1 = __lsx_vilvl_h(in15, in8);
+ vec0 = __lsx_vilvh_h(in15, in8);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 0);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 0);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 448);
+
+ vec1 = __lsx_vilvl_h(in14, in9);
+ vec0 = __lsx_vilvh_h(in14, in9);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 256);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 2);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 192);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+ cnst1 = __lsx_vreplvei_h(coeff, 3);
+ cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+ /* stp4 */
+ DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+ vec1 = __lsx_vilvl_h(in13, in10);
+ vec0 = __lsx_vilvh_h(in13, in10);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 128);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 1);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 320);
+
+ DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+ vec1 = __lsx_vilvl_h(in12, in11);
+ vec0 = __lsx_vilvh_h(in12, in11);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 384);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 3);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 64);
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ int16_t *input_tmp = input;
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+ 112, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+ input_tmp, 240, in12, in13, in14, in15);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+ in14, in15);
+
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+ tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+ __lsx_vst(in8, input, 0);
+ __lsx_vst(in9, input, 32);
+ __lsx_vst(in10, input, 64);
+ __lsx_vst(in11, input, 96);
+ __lsx_vst(in12, input, 128);
+ __lsx_vst(in13, input, 160);
+ __lsx_vst(in14, input, 192);
+ __lsx_vst(in15, input, 224);
+
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+ in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ __lsx_vst(tmp0, output, 0);
+ __lsx_vst(in0, output, 32);
+ __lsx_vst(tmp1, output, 64);
+ __lsx_vst(in1, output, 96);
+ __lsx_vst(tmp2, output, 128);
+ __lsx_vst(in2, output, 160);
+ __lsx_vst(tmp3, output, 192);
+ __lsx_vst(in3, output, 224);
+
+ LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ __lsx_vst(tmp4, output, 16);
+ __lsx_vst(in4, output, 48);
+ __lsx_vst(tmp5, output, 80);
+ __lsx_vst(in5, output, 112);
+ __lsx_vst(tmp6, output, 144);
+ __lsx_vst(in6, output, 176);
+ __lsx_vst(tmp7, output, 208);
+ __lsx_vst(in7, output, 240);
+}
+
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+
+ in0 = __lsx_vld(input, 0);
+ DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+ in3 = __lsx_vldx(input, src_stride6);
+
+ /* fdct4 pre-process */
+ {
+ __m128i vec, mask;
+ __m128i zero = __lsx_vldi(0);
+
+ mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+ DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+ in3);
+ vec = __lsx_vseqi_h(in0, 0);
+ vec = __lsx_vxori_b(vec, 255);
+ vec = __lsx_vand_v(mask, vec);
+ in0 = __lsx_vadd_h(in0, vec);
+ }
+
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in2, output, 16);
+}
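Annotation: the "fdct4 pre-process" block vectorizes a quirk of the C reference: inputs are upscaled by 16 and, when the very first sample is nonzero, it is bumped by 1 before the first pass (the mask built from __lsx_vinsgr2vr_b confines the bump to lane 0). A scalar sketch, assuming it mirrors vpx_fdct4x4_c's behaviour:

  #include <stdint.h>

  /* Scalar model of the fdct4 pre-process performed above. */
  static void fdct4x4_preprocess(const int16_t *input, int stride,
                                 int16_t temp[4][4]) {
    int r, c;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        temp[r][c] = (int16_t)(input[r * stride + c] * 16);
    if (temp[0][0]) temp[0][0] += 1; /* nudge a nonzero first sample */
  }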
+
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int16_t *input_tmp = (int16_t *)input;
+
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+ in2);
+ in3 = __lsx_vldx(input_tmp, src_stride6);
+ input_tmp += src_stride4;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+ in6);
+ in7 = __lsx_vldx(input_tmp, src_stride6);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 16);
+ __lsx_vst(in2, output, 32);
+ __lsx_vst(in3, output, 48);
+ __lsx_vst(in4, output, 64);
+ __lsx_vst(in5, output, 80);
+ __lsx_vst(in6, output, 96);
+ __lsx_vst(in7, output, 112);
+}
+
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+ }
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
new file mode 100644
index 0000000000..4a9fce9a3d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+
+#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { \
+ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
+ \
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
+ vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
+ cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
+ vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
+ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
+ } while (0)
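Annotation: VP9_FDCT4 is the vector form of the 4-point forward DCT; each output lane follows the scalar recurrence below. This is a sketch patterned on libvpx's C reference, with the cospi constants taken from vpx_dsp/txfm_common.h and a local rounding helper standing in for the usual fdct_round_shift:

  #include <stdint.h>
  #include "vpx_dsp/txfm_common.h" /* cospi_8_64, cospi_16_64, cospi_24_64 */

  static int16_t fdct_round_shift14(int32_t v) {
    return (int16_t)((v + (1 << 13)) >> 14); /* DCT_CONST_BITS == 14 */
  }

  /* Scalar 4-point forward DCT, modelled per column. */
  static void fdct4(const int16_t in[4], int16_t out[4]) {
    const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];
    const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
    out[0] = fdct_round_shift14((s0 + s1) * cospi_16_64);
    out[2] = fdct_round_shift14((s0 - s1) * cospi_16_64);
    out[1] = fdct_round_shift14(s2 * cospi_24_64 + s3 * cospi_8_64);
    out[3] = fdct_round_shift14(s3 * cospi_24_64 - s2 * cospi_8_64);
  }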
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ } while (0)
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
+ vec1_m, vec2_m, vec3_m); \
+ DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
+ vec5_m, vec6_m, vec7_m); \
+ DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
+ in3, in0, in1, in2, in3); \
+ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
+ in7, in4, in5, in6, in7); \
+ } while (0)
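Annotation: SRLI_AVE_S_4V_H combines a logical shift (which isolates the sign bit) with a signed average to get round-toward-zero division by two, the final scaling step of the 8x8 forward transform. Per element it computes:

  #include <stdint.h>

  /* Per-element model of SRLI_AVE_S_4V_H:
     __lsx_vsrli_h(v, 15) yields the sign bit (0 or 1), and
     __lsx_vavg_h(sign, v) is the truncating average (sign + v) >> 1,
     i.e. (v + (v < 0)) >> 1 -- divide by 2 rounding toward zero. */
  static int16_t srli_ave(int16_t v) {
    return (int16_t)((v + (v < 0)) >> 1);
  }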
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ __m128i one = __lsx_vreplgr2vr_h(1); \
+ \
+ tp0_m = __lsx_vslei_h(vec0, 0); \
+ tp1_m = __lsx_vslei_h(vec1, 0); \
+ tp0_m = __lsx_vxori_b(tp0_m, 255); \
+ tp1_m = __lsx_vxori_b(tp1_m, 255); \
+ vec0 = __lsx_vadd_h(vec0, one); \
+ vec1 = __lsx_vadd_h(vec1, one); \
+ tp0_m = __lsx_vand_v(one, tp0_m); \
+ tp1_m = __lsx_vand_v(one, tp1_m); \
+ vec0 = __lsx_vadd_h(vec0, tp0_m); \
+ vec1 = __lsx_vadd_h(vec1, tp1_m); \
+ vec0 = __lsx_vsrai_h(vec0, 2); \
+ vec1 = __lsx_vsrai_h(vec1, 2); \
+ } while (0)
+
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ __m128i one_m = __lsx_vldi(0x401); \
+ \
+ tp0_m = __lsx_vslti_h(vec0, 0); \
+ tp1_m = __lsx_vslti_h(vec1, 0); \
+ vec0 = __lsx_vadd_h(vec0, one_m); \
+ vec1 = __lsx_vadd_h(vec1, one_m); \
+ tp0_m = __lsx_vand_v(one_m, tp0_m); \
+ tp1_m = __lsx_vand_v(one_m, tp1_m); \
+ vec0 = __lsx_vadd_h(vec0, tp0_m); \
+ vec1 = __lsx_vadd_h(vec1, tp1_m); \
+ vec0 = __lsx_vsrai_h(vec0, 2); \
+ vec1 = __lsx_vsrai_h(vec1, 2); \
+ } while (0)
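Annotation: the two post-processing macros above differ only in which side of zero receives the extra increment before the divide by four. Their per-element effect, spelled out as a scalar model of the macro bodies:

  #include <stdint.h>

  /* FDCT32_POSTPROC_2V_POS_H: add 1, add 1 more for positive values,
     then arithmetic shift right by 2. */
  static int16_t postproc_pos(int16_t v) {
    return (int16_t)((v + 1 + (v > 0)) >> 2);
  }

  /* FDCT_POSTPROC_2V_NEG_H: add 1, add 1 more for negative values,
     then arithmetic shift right by 2. */
  static int16_t postproc_neg(int16_t v) {
    return (int16_t)((v + 1 + (v < 0)) >> 2);
  }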
+
+#define FDCT32_POSTPROC_NEG_W(vec) \
+ do { \
+ __m128i temp_m; \
+ __m128i one_m = __lsx_vreplgr2vr_w(1); \
+ \
+ temp_m = __lsx_vslti_w(vec, 0); \
+ vec = __lsx_vadd_w(vec, one_m); \
+ temp_m = __lsx_vand_v(one_m, temp_m); \
+ vec = __lsx_vadd_w(vec, temp_m); \
+ vec = __lsx_vsrai_w(vec, 2); \
+ } while (0)
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
+ const0, const1, out0, out1, out2, out3) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \
+ __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \
+ \
+ s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \
+ k0_m = __lsx_vpackev_w(s0_m, k0_m); \
+ \
+ DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \
+ s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \
+ s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \
+ s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \
+ s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \
+ s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \
+ s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \
+ s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \
+ s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \
+ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+ DCT_CONST_BITS, out0, out1); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \
+ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+ DCT_CONST_BITS, out2, out3); \
+ } while (0)
+
+#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \
+ in3) \
+ do { \
+ __m128i dst0_m, dst1_m, dst2_m, dst3_m; \
+ __m128i tmp0_m, tmp1_m; \
+ __m128i res0_m, res1_m, res2_m, res3_m; \
+ \
+ dst0_m = __lsx_vld(dst, 0); \
+ DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \
+ dst3_m = __lsx_vldx(dst, _stride3); \
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \
+ in3, res0_m, res1_m, res2_m, res3_m); \
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \
+ tmp0_m, tmp1_m); \
+ __lsx_vstelm_d(tmp0_m, dst, 0, 0); \
+ __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \
+ __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \
+ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \
+ } while (0)
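Annotation: VP9_ADDBLK_ST8x4_UB is the reconstruction step: four rows of eight int16 residuals are added to the uint8 destination and stored back with unsigned saturation (the __lsx_vssrarni_bu_h step). A scalar sketch:

  #include <stdint.h>

  static uint8_t clip_u8(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  /* Scalar model of VP9_ADDBLK_ST8x4_UB over an 8-wide, 4-row block. */
  static void addblk_st8x4(uint8_t *dst, int stride, const int16_t res[4][8]) {
    int r, c;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 8; ++c)
        dst[r * stride + c] = clip_u8(dst[r * stride + c] + res[r][c]);
  }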
+
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ __m128i x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ } while (0)
+
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ do { \
+ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \
+ __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \
+ __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \
+ __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \
+ \
+ /* stp 1 */ \
+ DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \
+ \
+ cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \
+ cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \
+ \
+ /* stp2 */ \
+ LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \
+ stp32_m, stp33_m); \
+ LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \
+ stp35_m, stp34_m); \
+ \
+ DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \
+ vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \
+ vec5_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \
+ \
+ /* stp4 */ \
+ LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \
+ vec4_m, vec5_m); \
+ LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \
+ stp24_m, stp31_m); \
+ \
+ vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \
+ vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \
+ \
+ vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \
+ vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \
+ \
+ vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \
+ vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \
+ \
+ vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \
+ vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \
+ } while (0)
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
new file mode 100644
index 0000000000..ec07f57d90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define UNPCK_UB_SH(_in, _out0, _out1) \
+ do { \
+ _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
+ _out1 = __lsx_vexth_hu_bu(_in); \
+ } while (0)
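Annotation: UNPCK_UB_SH widens one vector of 16 unsigned bytes into two vectors of 8 unsigned halfwords (low lanes, then high lanes). In scalar terms:

  #include <stdint.h>

  /* Scalar model of UNPCK_UB_SH. */
  static void unpck_ub_sh(const uint8_t in[16], uint16_t out0[8],
                          uint16_t out1[8]) {
    int i;
    for (i = 0; i < 8; ++i) {
      out0[i] = in[i];     /* __lsx_vsllwil_hu_bu(_in, 0) */
      out1[i] = in[i + 8]; /* __lsx_vexth_hu_bu(_in)      */
    }
  }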
+
+static void idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
+ n1);
+ DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
+ m3, n3);
+ DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
+ n5);
+ DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
+ m7, n7);
+
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+
+ __lsx_vst(m0, tmp_buf, 0);
+ __lsx_vst(n0, tmp_buf, 16);
+ __lsx_vst(m1, tmp_buf, 32);
+ __lsx_vst(n1, tmp_buf, 48);
+ __lsx_vst(m2, tmp_buf, 64);
+ __lsx_vst(n2, tmp_buf, 80);
+ __lsx_vst(m3, tmp_buf, 96);
+ __lsx_vst(n3, tmp_buf, 112);
+ __lsx_vst(m4, tmp_buf, 128);
+ __lsx_vst(n4, tmp_buf, 144);
+ __lsx_vst(m5, tmp_buf, 160);
+ __lsx_vst(n5, tmp_buf, 176);
+ __lsx_vst(m6, tmp_buf, 192);
+ __lsx_vst(n6, tmp_buf, 208);
+ __lsx_vst(m7, tmp_buf, 224);
+ __lsx_vst(n7, tmp_buf, 240);
+
+ /* 3rd & 4th 8x8 */
+ DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
+ n1);
+ DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
+ m3, n3);
+ DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
+ m5, n5);
+ DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
+ m7, n7);
+
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+
+ __lsx_vst(m0, tmp_buf, 256);
+ __lsx_vst(n0, tmp_buf, 272);
+ __lsx_vst(m1, tmp_buf, 288);
+ __lsx_vst(n1, tmp_buf, 304);
+ __lsx_vst(m2, tmp_buf, 320);
+ __lsx_vst(n2, tmp_buf, 336);
+ __lsx_vst(m3, tmp_buf, 352);
+ __lsx_vst(n3, tmp_buf, 368);
+ __lsx_vst(m4, tmp_buf, 384);
+ __lsx_vst(n4, tmp_buf, 400);
+ __lsx_vst(m5, tmp_buf, 416);
+ __lsx_vst(n5, tmp_buf, 432);
+ __lsx_vst(m6, tmp_buf, 448);
+ __lsx_vst(n6, tmp_buf, 464);
+ __lsx_vst(m7, tmp_buf, 480);
+ __lsx_vst(n7, tmp_buf, 496);
+}
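Annotation: each LSX_TRANSPOSE8x8_H pair above turns a 16x8 slab of coefficients into eight 16-wide rows; the underlying operation is a plain 8x8 transpose, shown here in scalar form for reference:

  #include <stdint.h>

  /* Scalar equivalent of one LSX_TRANSPOSE8x8_H call. */
  static void transpose_8x8(const int16_t in[8][8], int16_t out[8][8]) {
    int r, c;
    for (r = 0; r < 8; ++r)
      for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
  }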
+
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+ __m128i tmp0;
+
+ /* Even stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
+ reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = __lsx_vadd_h(reg0, reg4);
+ reg0 = __lsx_vsub_h(reg0, reg4);
+ reg4 = __lsx_vadd_h(reg6, reg2);
+ reg6 = __lsx_vsub_h(reg6, reg2);
+ reg2 = __lsx_vadd_h(reg1, reg5);
+ reg1 = __lsx_vsub_h(reg1, reg5);
+ reg5 = __lsx_vadd_h(reg7, reg3);
+ reg7 = __lsx_vsub_h(reg7, reg3);
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = __lsx_vadd_h(reg3, reg4);
+ reg3 = __lsx_vsub_h(reg3, reg4);
+ reg4 = __lsx_vsub_h(reg5, vec1);
+ reg5 = __lsx_vadd_h(reg5, vec1);
+
+ tmp0 = __lsx_vneg_h(reg6);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = __lsx_vsub_h(reg0, reg6);
+ reg0 = __lsx_vadd_h(reg0, reg6);
+ vec1 = __lsx_vsub_h(reg7, reg1);
+ reg7 = __lsx_vadd_h(reg7, reg1);
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3: Dependency on Even stage 1 & Even stage 2 */
+ LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 240);
+ __lsx_vst(loc1, tmp_eve_buf, 0);
+ __lsx_vst(loc2, tmp_eve_buf, 224);
+ __lsx_vst(loc3, tmp_eve_buf, 16);
+
+ LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 208);
+ __lsx_vst(loc1, tmp_eve_buf, 32);
+ __lsx_vst(loc2, tmp_eve_buf, 192);
+ __lsx_vst(loc3, tmp_eve_buf, 48);
+
+ /* Store 8 */
+ LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 176);
+ __lsx_vst(loc1, tmp_eve_buf, 64);
+ __lsx_vst(loc2, tmp_eve_buf, 160);
+ __lsx_vst(loc3, tmp_eve_buf, 80);
+
+ LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 144);
+ __lsx_vst(loc1, tmp_eve_buf, 96);
+ __lsx_vst(loc2, tmp_eve_buf, 128);
+ __lsx_vst(loc3, tmp_eve_buf, 112);
+}
+
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = __lsx_vadd_h(reg0, reg3);
+ reg0 = __lsx_vsub_h(reg0, reg3);
+ reg3 = __lsx_vadd_h(reg7, reg4);
+ reg7 = __lsx_vsub_h(reg7, reg4);
+ reg4 = __lsx_vadd_h(reg1, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg2);
+ reg2 = __lsx_vadd_h(reg6, reg5);
+ reg6 = __lsx_vsub_h(reg6, reg5);
+ reg5 = vec0;
+
+ /* 4 Stores */
+ DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 64);
+ __lsx_vst(vec1, tmp_odd_buf, 80);
+
+ DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 0);
+ __lsx_vst(vec1, tmp_odd_buf, 16);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 96);
+ __lsx_vst(vec1, tmp_odd_buf, 112);
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ __lsx_vst(vec2, tmp_odd_buf, 32);
+ __lsx_vst(vec3, tmp_odd_buf, 48);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+ vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 192);
+ __lsx_vst(vec1, tmp_odd_buf, 240);
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 160);
+ __lsx_vst(vec1, tmp_odd_buf, 176);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
+ vec2, vec0, vec3);
+ LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ __lsx_vst(reg0, tmp_odd_buf, 208);
+ __lsx_vst(reg1, tmp_odd_buf, 224);
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ __lsx_vst(reg0, tmp_odd_buf, 128);
+ __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3: Dependency on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+ tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+ tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 0);
+ __lsx_vst(loc1, tmp_odd_buf, 16);
+ __lsx_vst(loc2, tmp_odd_buf, 32);
+ __lsx_vst(loc3, tmp_odd_buf, 48);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 128);
+ __lsx_vst(loc1, tmp_odd_buf, 144);
+ __lsx_vst(loc2, tmp_odd_buf, 160);
+ __lsx_vst(loc3, tmp_odd_buf, 176);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+ tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+ tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 64);
+ __lsx_vst(loc1, tmp_odd_buf, 80);
+ __lsx_vst(loc2, tmp_odd_buf, 96);
+ __lsx_vst(loc3, tmp_odd_buf, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 192);
+ __lsx_vst(loc1, tmp_odd_buf, 208);
+ __lsx_vst(loc2, tmp_odd_buf, 224);
+ __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, int16_t *dst) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+ __m128i reg0, reg1, reg2, reg3;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+ tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+ tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+ m4, m2, m6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 496);
+ __lsx_vst(reg1, tmp_buf, 368);
+ __lsx_vst(reg2, tmp_buf, 432);
+ __lsx_vst(reg3, tmp_buf, 304);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+ tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+ tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+ m5, m3, m7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 464);
+ __lsx_vst(reg1, tmp_buf, 336);
+ __lsx_vst(reg2, tmp_buf, 400);
+ __lsx_vst(reg3, tmp_buf, 272);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+ tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+ tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+ n4, n2, n6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 480);
+ __lsx_vst(reg1, tmp_buf, 352);
+ __lsx_vst(reg2, tmp_buf, 416);
+ __lsx_vst(reg3, tmp_buf, 288);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+ tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+ tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+ n5, n3, n7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 448);
+ __lsx_vst(reg1, tmp_buf, 320);
+ __lsx_vst(reg2, tmp_buf, 384);
+ __lsx_vst(reg3, tmp_buf, 256);
+
+ /* Transpose : 16 vectors */
+ /* 1st & 2nd 8x8 */
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ __lsx_vst(m0, dst, 0);
+ __lsx_vst(n0, dst, 64);
+ __lsx_vst(m1, dst, 128);
+ __lsx_vst(n1, dst, 192);
+ __lsx_vst(m2, dst, 256);
+ __lsx_vst(n2, dst, 320);
+ __lsx_vst(m3, dst, 384);
+ __lsx_vst(n3, dst, 448);
+
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ __lsx_vst(m4, dst, 16);
+ __lsx_vst(n4, dst, 80);
+ __lsx_vst(m5, dst, 144);
+ __lsx_vst(n5, dst, 208);
+ __lsx_vst(m6, dst, 272);
+ __lsx_vst(n6, dst, 336);
+ __lsx_vst(m7, dst, 400);
+ __lsx_vst(n7, dst, 464);
+
+ /* 3rd & 4th 8x8 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
+ m0, n0, m1, n1);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
+ m2, n2, m3, n3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
+ m4, n4, m5, n5);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
+ m6, n6, m7, n7);
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ __lsx_vst(m0, dst, 32);
+ __lsx_vst(n0, dst, 96);
+ __lsx_vst(m1, dst, 160);
+ __lsx_vst(n1, dst, 224);
+ __lsx_vst(m2, dst, 288);
+ __lsx_vst(n2, dst, 352);
+ __lsx_vst(m3, dst, 416);
+ __lsx_vst(n3, dst, 480);
+ __lsx_vst(m4, dst, 48);
+ __lsx_vst(n4, dst, 112);
+ __lsx_vst(m5, dst, 176);
+ __lsx_vst(n5, dst, 240);
+ __lsx_vst(m6, dst, 304);
+ __lsx_vst(n6, dst, 368);
+ __lsx_vst(m7, dst, 432);
+ __lsx_vst(n7, dst, 496);
+}
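Annotation: the final butterfly merges the 16 even-part and 16 odd-part results into the 32 outputs of one 1-D idct32: sums fill positions 0..15 and differences fill 31..16, which is why the stores above use mirrored offsets. A scalar restatement:

  #include <stdint.h>

  /* Scalar model of the idct32 final butterfly. */
  static void idct32_final_butterfly(const int16_t even[16],
                                     const int16_t odd[16], int16_t out[32]) {
    int k;
    for (k = 0; k < 16; ++k) {
      out[k] = (int16_t)(even[k] + odd[k]);
      out[31 - k] = (int16_t)(even[k] - odd[k]);
    }
  }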
+
+static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct32x8_row_transpose_store(input, &tmp_buf[0]);
+ idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+ idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+ idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+ output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+ __m128i tmp0;
+
+ /* Even stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+ 1792, reg4, reg5, reg6, reg7);
+ tmp_buf += 64;
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ /* Load 8 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+ 1792, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = __lsx_vadd_h(reg0, reg4);
+ reg0 = __lsx_vsub_h(reg0, reg4);
+ reg4 = __lsx_vadd_h(reg6, reg2);
+ reg6 = __lsx_vsub_h(reg6, reg2);
+ reg2 = __lsx_vadd_h(reg1, reg5);
+ reg1 = __lsx_vsub_h(reg1, reg5);
+ reg5 = __lsx_vadd_h(reg7, reg3);
+ reg7 = __lsx_vsub_h(reg7, reg3);
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = __lsx_vadd_h(reg3, reg4);
+ reg3 = __lsx_vsub_h(reg3, reg4);
+ reg4 = __lsx_vsub_h(reg5, vec1);
+ reg5 = __lsx_vadd_h(reg5, vec1);
+
+ tmp0 = __lsx_vneg_h(reg6);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = __lsx_vsub_h(reg0, reg6);
+ reg0 = __lsx_vadd_h(reg0, reg6);
+ vec1 = __lsx_vsub_h(reg7, reg1);
+ reg7 = __lsx_vadd_h(reg7, reg1);
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3: Dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 */
+ LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 0);
+ __lsx_vst(loc3, tmp_eve_buf, 16);
+ __lsx_vst(loc2, tmp_eve_buf, 224);
+ __lsx_vst(loc0, tmp_eve_buf, 240);
+
+ LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 32);
+ __lsx_vst(loc3, tmp_eve_buf, 48);
+ __lsx_vst(loc2, tmp_eve_buf, 192);
+ __lsx_vst(loc0, tmp_eve_buf, 208);
+
+ /* Store 8 */
+ LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 64);
+ __lsx_vst(loc3, tmp_eve_buf, 80);
+ __lsx_vst(loc2, tmp_eve_buf, 160);
+ __lsx_vst(loc0, tmp_eve_buf, 176);
+
+ LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 96);
+ __lsx_vst(loc3, tmp_eve_buf, 112);
+ __lsx_vst(loc2, tmp_eve_buf, 128);
+ __lsx_vst(loc0, tmp_eve_buf, 144);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
+ 1984, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = __lsx_vadd_h(reg0, reg3);
+ reg0 = __lsx_vsub_h(reg0, reg3);
+ reg3 = __lsx_vadd_h(reg7, reg4);
+ reg7 = __lsx_vsub_h(reg7, reg4);
+ reg4 = __lsx_vadd_h(reg1, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg2);
+ reg2 = __lsx_vadd_h(reg6, reg5);
+ reg6 = __lsx_vsub_h(reg6, reg5);
+ reg5 = vec0;
+
+ /* 4 Stores */
+ DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 64);
+ __lsx_vst(vec1, tmp_odd_buf, 80);
+ DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 0);
+ __lsx_vst(vec1, tmp_odd_buf, 16);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 96);
+ __lsx_vst(vec1, tmp_odd_buf, 112);
+ __lsx_vst(vec2, tmp_odd_buf, 32);
+ __lsx_vst(vec3, tmp_odd_buf, 48);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
+ 1856, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+ vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+ __lsx_vst(vec0, tmp_odd_buf, 192);
+ __lsx_vst(vec1, tmp_odd_buf, 240);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 160);
+ __lsx_vst(vec1, tmp_odd_buf, 176);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
+ vec1, vec2, vec3);
+ LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ __lsx_vst(reg0, tmp_odd_buf, 208);
+ __lsx_vst(reg1, tmp_odd_buf, 224);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ __lsx_vst(reg0, tmp_odd_buf, 128);
+ __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3: Dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+ tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+ tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 0);
+ __lsx_vst(loc1, tmp_odd_buf, 16);
+ __lsx_vst(loc2, tmp_odd_buf, 32);
+ __lsx_vst(loc3, tmp_odd_buf, 48);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 128);
+ __lsx_vst(loc1, tmp_odd_buf, 144);
+ __lsx_vst(loc2, tmp_odd_buf, 160);
+ __lsx_vst(loc3, tmp_odd_buf, 176);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+ tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+ tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 64);
+ __lsx_vst(loc1, tmp_odd_buf, 80);
+ __lsx_vst(loc2, tmp_odd_buf, 96);
+ __lsx_vst(loc3, tmp_odd_buf, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 192);
+ __lsx_vst(loc1, tmp_odd_buf, 208);
+ __lsx_vst(loc2, tmp_odd_buf, 224);
+ __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, uint8_t *dst,
+ int32_t dst_stride) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+ int32_t stride = dst_stride << 2;
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride + stride2;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+ tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+ tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+ m4, m2, m6);
+ DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+ VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
+ m2, m4, m0);
+ DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+ VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
+ m4, m6);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+ tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+ tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+ m5, m3, m7);
+ DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
+ m5, m7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
+ m3, m5, m1);
+ DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
+ m5, m7);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+ tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+ tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+ n4, n2, n6);
+ DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
+ n6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
+ n2, n4, n0);
+ DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
+ n4, n6);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+ tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+ tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+ n5, n3, n7);
+ DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
+ n5, n7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
+ n3, n5, n1);
+ DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
+ n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+ idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+ idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+ dst_stride);
+}
+
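+/* Flow of the full (1024-coefficient) inverse below: the separable 32x32
+ * IDCT runs as four 32x8 row transforms into a 32x32 staging buffer
+ * (strips i << 8 int16 elements apart), then four 8x32 column transforms
+ * that add the residual into dst (strips i << 3 pixels apart). */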
+void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block */
+ idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+ __m128i zero = __lsx_vldi(0);
+
+ for (i = 32; i--;) {
+ __lsx_vst(zero, out_ptr, 0);
+ __lsx_vst(zero, out_ptr, 16);
+ __lsx_vst(zero, out_ptr, 32);
+ __lsx_vst(zero, out_ptr, 48);
+ out_ptr += 32;
+ }
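+
+  /* The clear above is required because only one 32x8 row pass runs for the
+   * 34-coefficient case: rows 8..31 of the staging buffer stay zero and the
+   * column pass reads them as such. */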
+
+ out_ptr = out_arr;
+
+ /* rows: only upper-left 8x8 has non-zero coeff */
+ idct32x8_1d_rows_lsx(input, out_ptr);
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ int16_t out;
+ __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 6);
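+  /* Scalar form of the DC-only path above (illustrative): with a single
+   * nonzero coefficient, both 1-D passes collapse to one multiply by
+   * cospi_16_64 with DCT_CONST_BITS (14-bit) rounding, followed by the
+   * final bit-depth rounding:
+   *   dc = (in * c + 8192) >> 14;  dc = (dc * c + 8192) >> 14;
+   *   dc = (dc + 32) >> 6;
+   * and dc is then replicated across all lanes below. */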
+
+ vec = __lsx_vreplgr2vr_h(out);
+
+ for (i = 16; i--;) {
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ dst2 = __lsx_vldx(dst, dst_stride);
+ dst3 = __lsx_vldx(dst + 16, dst_stride);
+
+ UNPCK_UB_SH(dst0, res0, res4);
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+
+ DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+ res1, res2, res3);
+ DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
+ res5, res6, res7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
+ res7, res3, 0, tmp0, tmp1, tmp2, tmp3);
+ __lsx_vst(tmp0, dst, 0);
+ __lsx_vst(tmp1, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(tmp2, dst, 0);
+ __lsx_vst(tmp3, dst, 16);
+ dst += dst_stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
new file mode 100644
index 0000000000..f990211791
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top,
+ const uint8_t *src_left,
+ uint8_t *dst, int32_t dst_stride) {
+ uint64_t val0, val1;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i store, sum_h, sum_w, sum_d;
+ __m128i src = { 0 };
+
+ val0 = *(const uint64_t *)src_top;
+ val1 = *(const uint64_t *)src_left;
+ DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src);
+ sum_h = __lsx_vhaddw_hu_bu(src, src);
+ sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+ sum_w = __lsx_vpickev_w(sum_d, sum_d);
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+ sum_w = __lsx_vsrari_w(sum_d, 4);
+ store = __lsx_vreplvei_b(sum_w, 0);
+
+ __lsx_vstelm_d(store, dst, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+ dst += dst_stride_x4;
+ __lsx_vstelm_d(store, dst, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+}
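+
+/* For reference, a scalar sketch of the same rule (illustrative, not
+ * compiled): average the 8 top and 8 left neighbours with rounding and
+ * fill the block.
+ *
+ *   uint32_t sum = 0;
+ *   for (int i = 0; i < 8; ++i) sum += src_top[i] + src_left[i];
+ *   const uint8_t dc = (uint8_t)((sum + 8) >> 4);
+ *   for (int r = 0; r < 8; ++r) memset(dst + r * dst_stride, dc, 8);
+ */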
+
+static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top,
+ const uint8_t *src_left,
+ uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i top, left, out;
+ __m128i sum_h, sum_top, sum_left;
+ __m128i sum_w;
+ __m128i sum_d;
+
+ DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left);
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left);
+ sum_h = __lsx_vadd_h(sum_top, sum_left);
+ sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+ sum_w = __lsx_vpickev_w(sum_d, sum_d);
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+ sum_w = __lsx_vsrari_w(sum_d, 5);
+ out = __lsx_vreplvei_b(sum_w, 0);
+
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+}
+
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_8x8_lsx(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_16x16_lsx(above, left, dst, y_stride);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
new file mode 100644
index 0000000000..0503df9966
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+#include "vpx_ports/mem.h"
+
+#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
+ _in2, _in3, _in4, _in5, _in6, _in7) \
+ do { \
+ _in0 = __lsx_vld(_src, 0); \
+ _in1 = __lsx_vldx(_src, _stride); \
+ _in2 = __lsx_vldx(_src, _stride2); \
+ _in3 = __lsx_vldx(_src, _stride3); \
+ _src += _stride4; \
+ _in4 = __lsx_vld(_src, 0); \
+ _in5 = __lsx_vldx(_src, _stride); \
+ _in6 = __lsx_vldx(_src, _stride2); \
+ _in7 = __lsx_vldx(_src, _stride3); \
+ } while (0)
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
+ _stride, _stride2, _stride3, _stride4) \
+ do { \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ __lsx_vstx(_dst2, _dst, _stride2); \
+ __lsx_vstx(_dst3, _dst, _stride3); \
+ _dst += _stride4; \
+ __lsx_vst(_dst4, _dst, 0); \
+ __lsx_vstx(_dst5, _dst, _stride); \
+ __lsx_vstx(_dst6, _dst, _stride2); \
+ __lsx_vstx(_dst7, _dst, _stride3); \
+ } while (0)
+
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
+ uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+ -stride, p3, p2, p1, p0);
+
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__lsx_bz_v(flat)) {
+ __lsx_vstx(p1_out, dst, -stride2);
+ __lsx_vstx(p0_out, dst, -stride);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vstx(q1_out, dst, stride);
+
+ return 1;
+ }
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* store pixel values */
+ DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat,
+ p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out,
+ p0_out, q0_out);
+ DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat,
+ q1_out, q2_out);
+
+ __lsx_vst(p2_out, filter48, 0);
+ __lsx_vst(p1_out, filter48, 16);
+ __lsx_vst(p0_out, filter48, 32);
+ __lsx_vst(q0_out, filter48, 48);
+ __lsx_vst(q1_out, filter48, 64);
+ __lsx_vst(q2_out, filter48, 80);
+ __lsx_vst(flat, filter48, 96);
+
+ return 0;
+}
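+
+/* Layout of the filter48 scratch shared with hz_lpf_t16_16w, as stored
+ * above: one 16-byte vector each for p2, p1, p0, q0, q1 and q2 at byte
+ * offsets 0, 16, 32, 48, 64 and 80, plus the flat mask at offset 96. */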
+
+static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) {
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+ uint8_t *dst_tmp0 = dst - stride4;
+ uint8_t *dst_tmp1 = dst + stride4;
+
+ __m128i flat, flat2, filter8;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ __m128i out_h, out_l;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+ v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+ v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+ v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+ v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+ v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+ v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+
+ flat = __lsx_vld(filter48, 96);
+
+ DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+ -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+
+ p3 = __lsx_vld(dst_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1);
+ p0 = __lsx_vldx(dst_tmp0, stride3);
+
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ q4 = __lsx_vld(dst_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+ q7 = __lsx_vldx(dst_tmp1, stride3);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__lsx_bz_v(flat2)) {
+ DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+ p2, p1, p0, q0);
+ DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+ __lsx_vstx(p2, dst, -stride3);
+ __lsx_vstx(p1, dst, -stride2);
+ __lsx_vstx(p0, dst, -stride);
+ __lsx_vst(q0, dst, 0);
+ __lsx_vstx(q1, dst, stride);
+ __lsx_vstx(q2, dst, stride2);
+ } else {
+ dst = dst_tmp0 - stride3;
+
+ p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+ p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+ p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+ p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+ p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+ p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+ p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+ p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+ q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+ p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+ p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+ p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+ p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+ p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+ p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+ p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+ q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+ tmp0_h = p7_h_in << 3;
+ tmp0_h -= p7_h_in;
+ tmp0_h += p6_h_in;
+ tmp0_h += q0_h_in;
+ tmp1_h = p6_h_in + p5_h_in;
+ tmp1_h += p4_h_in;
+ tmp1_h += p3_h_in;
+ tmp1_h += p2_h_in;
+ tmp1_h += p1_h_in;
+ tmp1_h += p0_h_in;
+ tmp1_h += tmp0_h;
+
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+ __lsx_vst(p6, dst, 0);
+ dst += stride;
+
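+    /* From here on, each output row reuses the running accumulators: the
+     * 15-tap window slides one sample per row by adding the incoming sample
+     * and subtracting the outgoing one, e.g. in scalar form (illustrative)
+     *   acc += p5 + q1 - p6 - p7;  p5_out = (acc + 8) >> 4;
+     * which is what the tmp0/tmp1 updates below compute per lane. */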
+ /* p5 */
+ q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+ tmp0_h = p5_h_in - p6_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+ __lsx_vst(p5, dst, 0);
+ dst += stride;
+
+ /* p4 */
+ q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+ tmp0_h = p4_h_in - p5_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+ __lsx_vst(p4, dst, 0);
+ dst += stride;
+
+ /* p3 */
+ q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+ tmp0_h = p3_h_in - p4_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+ __lsx_vst(p3, dst, 0);
+ dst += stride;
+
+ /* p2 */
+ q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+ filter8 = __lsx_vld(filter48, 0);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+ tmp0_h = p2_h_in - p3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* p1 */
+ q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+ filter8 = __lsx_vld(filter48, 16);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+ tmp0_h = p1_h_in - p2_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* p0 */
+ q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+ filter8 = __lsx_vld(filter48, 32);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+ tmp0_h = p0_h_in - p1_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q0 */
+ q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+ filter8 = __lsx_vld(filter48, 48);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+ tmp0_h = q7_h_in - p0_h_in;
+ tmp0_h += q0_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q1 */
+ filter8 = __lsx_vld(filter48, 64);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q0_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p6_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q2 */
+ filter8 = __lsx_vld(filter48, 80);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q1_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p5_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q3 */
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q2_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p4_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+ __lsx_vst(q3, dst, 0);
+ dst += stride;
+
+ /* q4 */
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p3_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+ __lsx_vst(q4, dst, 0);
+ dst += stride;
+
+ /* q5 */
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q4_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p2_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+ __lsx_vst(q5, dst, 0);
+ dst += stride;
+
+ /* q6 */
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q5_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p1_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+ __lsx_vst(q6, dst, 0);
+ }
+}
+
+static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]);
+ uint8_t early_exit = 0;
+
+ early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr,
+ limit_ptr, thresh_ptr);
+
+ if (early_exit == 0) {
+ hz_lpf_t16_16w(dst, stride, filter48);
+ }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr, int32_t count) {
+ if (count == 1) {
+ __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i p0_filter16, p1_filter16;
+ __m128i p2_filter8, p1_filter8, p0_filter8;
+ __m128i q0_filter8, q1_filter8, q2_filter8;
+ __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+ __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+ __m128i zero = __lsx_vldi(0);
+ __m128i tmp0, tmp1, tmp2;
+
+ int32_t stride2 = stride << 1;
+    int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride << 2;
+ uint8_t *dst_tmp0 = dst - stride4;
+ uint8_t *dst_tmp1 = dst + stride4;
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+ -stride, p3, p2, p1, p0);
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ /* filter_mask* */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+ q1_out);
+ flat = __lsx_vilvl_d(zero, flat);
+ if (__lsx_bz_v(flat)) {
+ __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+ __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+ __lsx_vstelm_d(q0_out, dst, 0, 0);
+ __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+ } else {
+ /* convert 8 bit input data into 16 bit */
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+ p2_l, p1_l, p0_l);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+ q1_l, q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+ p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+ p0_filter8, q0_filter8);
+ DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+ q2_filter8);
+
+ /* store pixel values */
+ p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+ p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+ p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+ q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+ q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+ q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+ /* load 16 vector elements */
+ DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+ -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+ q4 = __lsx_vld(dst_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+ q7 = __lsx_vldx(dst_tmp1, stride3);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__lsx_bz_v(flat2)) {
+ dst -= stride3;
+ __lsx_vstelm_d(p2_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p0_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(q0_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(q1_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(q2_out, dst, 0, 0);
+ } else {
+        /* operate on the low (right-hand) 8 pixels */
+ DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l,
+ p6_l, p5_l, p4_l);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l,
+ q5_l, q6_l, q7_l);
+
+ tmp0 = __lsx_vslli_h(p7_l, 3);
+ tmp0 = __lsx_vsub_h(tmp0, p7_l);
+ tmp0 = __lsx_vadd_h(tmp0, p6_l);
+ tmp0 = __lsx_vadd_h(tmp0, q0_l);
+
+ dst = dst_tmp0 - stride3;
+
+ /* calculation of p6 and p5 */
+ tmp1 = __lsx_vadd_h(p6_l, p5_l);
+ tmp1 = __lsx_vadd_h(tmp1, p4_l);
+ tmp1 = __lsx_vadd_h(tmp1, p3_l);
+ tmp1 = __lsx_vadd_h(tmp1, p2_l);
+ tmp1 = __lsx_vadd_h(tmp1, p1_l);
+ tmp1 = __lsx_vadd_h(tmp1, p0_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp0 = __lsx_vsub_h(p5_l, p6_l);
+ tmp0 = __lsx_vadd_h(tmp0, q1_l);
+ tmp0 = __lsx_vsub_h(tmp0, p7_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16,
+ flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of p4 and p3 */
+ tmp0 = __lsx_vsub_h(p4_l, p5_l);
+ tmp0 = __lsx_vadd_h(tmp0, q2_l);
+ tmp0 = __lsx_vsub_h(tmp0, p7_l);
+ tmp2 = __lsx_vsub_h(p3_l, p4_l);
+ tmp2 = __lsx_vadd_h(tmp2, q3_l);
+ tmp2 = __lsx_vsub_h(tmp2, p7_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16,
+ flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of p2 and p1 */
+ tmp0 = __lsx_vsub_h(p2_l, p3_l);
+ tmp0 = __lsx_vadd_h(tmp0, q4_l);
+ tmp0 = __lsx_vsub_h(tmp0, p7_l);
+ tmp2 = __lsx_vsub_h(p1_l, p2_l);
+ tmp2 = __lsx_vadd_h(tmp2, q5_l);
+ tmp2 = __lsx_vsub_h(tmp2, p7_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out,
+ p1_filter16, flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of p0 and q0 */
+ tmp0 = __lsx_vsub_h(p0_l, p1_l);
+ tmp0 = __lsx_vadd_h(tmp0, q6_l);
+ tmp0 = __lsx_vsub_h(tmp0, p7_l);
+ tmp2 = __lsx_vsub_h(q7_l, p0_l);
+ tmp2 = __lsx_vadd_h(tmp2, q0_l);
+ tmp2 = __lsx_vsub_h(tmp2, p7_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out,
+ p1_filter16, flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of q1 and q2 */
+ tmp0 = __lsx_vsub_h(q7_l, q0_l);
+ tmp0 = __lsx_vadd_h(tmp0, q1_l);
+ tmp0 = __lsx_vsub_h(tmp0, p6_l);
+ tmp2 = __lsx_vsub_h(q7_l, q1_l);
+ tmp2 = __lsx_vadd_h(tmp2, q2_l);
+ tmp2 = __lsx_vsub_h(tmp2, p5_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out,
+ p1_filter16, flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of q3 and q4 */
+ tmp0 = __lsx_vsub_h(q7_l, q2_l);
+ tmp0 = __lsx_vadd_h(tmp0, q3_l);
+ tmp0 = __lsx_vsub_h(tmp0, p4_l);
+ tmp2 = __lsx_vsub_h(q7_l, q3_l);
+ tmp2 = __lsx_vadd_h(tmp2, q4_l);
+ tmp2 = __lsx_vsub_h(tmp2, p3_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16,
+ flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ dst += stride;
+
+ /* calculation of q5 and q6 */
+ tmp0 = __lsx_vsub_h(q7_l, q4_l);
+ tmp0 = __lsx_vadd_h(tmp0, q5_l);
+ tmp0 = __lsx_vsub_h(tmp0, p2_l);
+ tmp2 = __lsx_vsub_h(q7_l, q5_l);
+ tmp2 = __lsx_vadd_h(tmp2, q6_l);
+ tmp2 = __lsx_vsub_h(tmp2, p1_l);
+ tmp1 = __lsx_vadd_h(tmp1, tmp0);
+ p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+ tmp1 = __lsx_vadd_h(tmp1, tmp2);
+ p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+ DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+ p0_filter16, p1_filter16);
+ DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16,
+ flat2, p0_filter16, p1_filter16);
+ __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+ }
+ }
+ } else {
+ mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr,
+ thresh_ptr);
+ }
+}
+
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output,
+ int32_t out_stride) {
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp2, tmp3;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ int32_t in_stride2 = in_stride << 1;
+ int32_t in_stride3 = in_stride2 + in_stride;
+ int32_t in_stride4 = in_stride2 << 1;
+ int32_t out_stride2 = out_stride << 1;
+ int32_t out_stride3 = out_stride2 + out_stride;
+ int32_t out_stride4 = out_stride2 << 1;
+
+ LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1,
+ row2, row3, row4, row5, row6, row7);
+ input += in_stride4;
+ LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9,
+ row10, row11, row12, row13, row14, row15);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
+
+ /* transpose 16x8 matrix into 8x16 */
+  /* total 8 intermediate registers and 32 instructions */
+ q7 = __lsx_vpackod_d(row8, row0);
+ q6 = __lsx_vpackod_d(row9, row1);
+ q5 = __lsx_vpackod_d(row10, row2);
+ q4 = __lsx_vpackod_d(row11, row3);
+ q3 = __lsx_vpackod_d(row12, row4);
+ q2 = __lsx_vpackod_d(row13, row5);
+ q1 = __lsx_vpackod_d(row14, row6);
+ q0 = __lsx_vpackod_d(row15, row7);
+
+ DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
+
+ DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
+ DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
+
+ DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
+ q0 = __lsx_vpackev_w(tmp3, tmp2);
+ q4 = __lsx_vpackod_w(tmp3, tmp2);
+
+ tmp2 = __lsx_vpackod_h(tmp1, tmp0);
+ tmp3 = __lsx_vpackod_h(q7, q5);
+ q2 = __lsx_vpackev_w(tmp3, tmp2);
+ q6 = __lsx_vpackod_w(tmp3, tmp2);
+
+ DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
+ q1 = __lsx_vpackev_w(tmp3, tmp2);
+ q5 = __lsx_vpackod_w(tmp3, tmp2);
+
+ tmp2 = __lsx_vpackod_h(tmp5, tmp4);
+ tmp3 = __lsx_vpackod_h(tmp7, tmp6);
+ q3 = __lsx_vpackev_w(tmp3, tmp2);
+ q7 = __lsx_vpackod_w(tmp3, tmp2);
+
+ LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2,
+ out_stride3, out_stride4);
+ output += out_stride4;
+ LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2,
+ out_stride3, out_stride4);
+}
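+
+/* The vertical wide filters below reuse these horizontal kernels: a 16x16
+ * tile straddling the edge is transposed into a 16-byte-stride scratch
+ * buffer, filtered as if the edge were horizontal, and transposed back
+ * only when the filters actually changed pixels (see
+ * vpx_lpf_vertical_16_dual_lsx). */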
+
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
+ uint8_t *dst_org, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0);
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  /* if flat is zero for all pixels, the other filters can be skipped */
+ if (__lsx_bz_v(flat)) {
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ vec2 = __lsx_vilvl_h(vec1, vec0);
+ vec3 = __lsx_vilvh_h(vec1, vec0);
+ DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ vec4 = __lsx_vilvl_h(vec1, vec0);
+ vec5 = __lsx_vilvh_h(vec1, vec0);
+
+ dst_org -= 2;
+ __lsx_vstelm_w(vec2, dst_org, 0, 0);
+ __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec3, dst_org, 0, 0);
+ __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec4, dst_org, 0, 0);
+ __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec5, dst_org, 0, 0);
+ __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
+
+ return 1;
+ }
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* store pixel values */
+ p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ __lsx_vst(p2_out, filter48, 0);
+ __lsx_vst(p1_out, filter48, 16);
+ __lsx_vst(p0_out, filter48, 32);
+ __lsx_vst(q0_out, filter48, 48);
+ __lsx_vst(q1_out, filter48, 64);
+ __lsx_vst(q2_out, filter48, 80);
+ __lsx_vst(flat, filter48, 96);
+
+ return 0;
+}
+
+static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride,
+ uint8_t *filter48) {
+ __m128i flat, flat2, filter8;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ __m128i out_l, out_h;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+ v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+ v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+ v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+ v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+ v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+ v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+ uint8_t *dst_tmp = dst - 128;
+
+ flat = __lsx_vld(filter48, 96);
+
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7,
+ p6, p5, p4);
+ DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3,
+ p2, p1, p0);
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+ DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+  /* if flat2 is zero for all pixels, the wide filter can be skipped */
+ if (__lsx_bz_v(flat2)) {
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+ p2, p1, p0, q0);
+ DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
+ vec3 = __lsx_vilvl_h(vec1, vec0);
+ vec4 = __lsx_vilvh_h(vec1, vec0);
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
+ vec6 = __lsx_vilvl_h(vec1, vec0);
+ vec7 = __lsx_vilvh_h(vec1, vec0);
+ vec2 = __lsx_vilvl_b(q2, q1);
+ vec5 = __lsx_vilvh_b(q2, q1);
+
+ dst_org -= 3;
+ __lsx_vstelm_w(vec3, dst_org, 0, 0);
+ __lsx_vstelm_h(vec2, dst_org, 4, 0);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 1);
+ __lsx_vstelm_h(vec2, dst_org, 4, 1);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 2);
+ __lsx_vstelm_h(vec2, dst_org, 4, 2);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 3);
+ __lsx_vstelm_h(vec2, dst_org, 4, 3);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 0);
+ __lsx_vstelm_h(vec2, dst_org, 4, 4);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 1);
+ __lsx_vstelm_h(vec2, dst_org, 4, 5);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 2);
+ __lsx_vstelm_h(vec2, dst_org, 4, 6);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 3);
+ __lsx_vstelm_h(vec2, dst_org, 4, 7);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 0);
+ __lsx_vstelm_h(vec5, dst_org, 4, 0);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 1);
+ __lsx_vstelm_h(vec5, dst_org, 4, 1);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 2);
+ __lsx_vstelm_h(vec5, dst_org, 4, 2);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 3);
+ __lsx_vstelm_h(vec5, dst_org, 4, 3);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 0);
+ __lsx_vstelm_h(vec5, dst_org, 4, 4);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 1);
+ __lsx_vstelm_h(vec5, dst_org, 4, 5);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 2);
+ __lsx_vstelm_h(vec5, dst_org, 4, 6);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 3);
+ __lsx_vstelm_h(vec5, dst_org, 4, 7);
+
+ return 1;
+ }
+
+ dst -= 7 * 16;
+
+ p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+ p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+ p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+ p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+ p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+ p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+ p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+ p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+ q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+ p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+ p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+ p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+ p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+ p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+ p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+ p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+ q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+ tmp0_h = p7_h_in << 3;
+ tmp0_h -= p7_h_in;
+ tmp0_h += p6_h_in;
+ tmp0_h += q0_h_in;
+ tmp1_h = p6_h_in + p5_h_in;
+ tmp1_h += p4_h_in;
+ tmp1_h += p3_h_in;
+ tmp1_h += p2_h_in;
+ tmp1_h += p1_h_in;
+ tmp1_h += p0_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+ __lsx_vst(p6, dst, 0);
+
+ /* p5 */
+ q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+ tmp0_h = p5_h_in - p6_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+ __lsx_vst(p5, dst, 16);
+
+ /* p4 */
+ q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+ tmp0_h = p4_h_in - p5_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+ __lsx_vst(p4, dst, 16 * 2);
+
+ /* p3 */
+ q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+ tmp0_h = p3_h_in - p4_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+ __lsx_vst(p3, dst, 16 * 3);
+
+ /* p2 */
+ q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+ filter8 = __lsx_vld(filter48, 0);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+ tmp0_h = p2_h_in - p3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 4);
+
+ /* p1 */
+ q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+ filter8 = __lsx_vld(filter48, 16);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+ tmp0_h = p1_h_in - p2_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 5);
+
+ /* p0 */
+ q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+ filter8 = __lsx_vld(filter48, 32);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+ tmp0_h = p0_h_in - p1_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 6);
+
+ /* q0 */
+ q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+ filter8 = __lsx_vld(filter48, 48);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+ tmp0_h = q7_h_in - p0_h_in;
+ tmp0_h += q0_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 7);
+
+ /* q1 */
+ filter8 = __lsx_vld(filter48, 64);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q0_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p6_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 8);
+
+ /* q2 */
+ filter8 = __lsx_vld(filter48, 80);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q1_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p5_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 9);
+
+ /* q3 */
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q2_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p4_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+ __lsx_vst(q3, dst, 16 * 10);
+
+ /* q4 */
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p3_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+ __lsx_vst(q4, dst, 16 * 11);
+
+ /* q5 */
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q4_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p2_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+ __lsx_vst(q5, dst, 16 * 12);
+
+ /* q6 */
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q5_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p1_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+ __lsx_vst(q6, dst, 16 * 13);
+
+ return 0;
+}
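+
+/* Both vt_lpf_* helpers return 1 when an early-exit path has already
+ * written the final pixels through dst_org, so the caller can skip the
+ * inverse transpose; 0 means the results are still in the transposed
+ * scratch buffer and must be copied back. */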
+
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+ early_exit =
+ vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (early_exit == 0) {
+ early_exit =
+ vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+ if (early_exit == 0) {
+ transpose_16x16(transposed_input, 16, (src - 8), pitch);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
new file mode 100644
index 0000000000..9300b5c5ae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, thresh, b_limit, limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+ int32_t pitch2 = pitch << 1;
+ int32_t pitch3 = pitch2 + pitch;
+ int32_t pitch4 = pitch2 << 1;
+
+ DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+ p3, p2, p1, p0);
+ q0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+ q3 = __lsx_vldx(src, pitch3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ __lsx_vstelm_d(p1_out, src - pitch2, 0, 0);
+ __lsx_vstelm_d(p0_out, src - pitch, 0, 0);
+ __lsx_vstelm_d(q0_out, src, 0, 0);
+ __lsx_vstelm_d(q1_out, src + pitch, 0, 0);
+}
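+
+/* VP9_LPF_FILTER4_4W (from loopfilter_lsx.h) is the standard VP9 filter4;
+ * roughly, per pixel in scalar form (illustrative, on values biased by
+ * -128):
+ *   f  = clamp8(ps1 - qs1) & hev;
+ *   f  = clamp8(f + 3 * (qs0 - ps0)) & mask;
+ *   f1 = clamp8(f + 4) >> 3;   f2 = clamp8(f + 3) >> 3;
+ *   q0' = clamp8(qs0 - f1);    p0' = clamp8(ps0 + f2);
+ *   f  = ((f1 + 1) >> 1) & ~hev;
+ *   q1' = clamp8(qs1 - f);     p1' = clamp8(ps1 + f);
+ */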
+
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ int32_t pitch2 = pitch << 1;
+ int32_t pitch3 = pitch2 + pitch;
+ int32_t pitch4 = pitch2 << 1;
+
+ DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+ p3, p2, p1, p0);
+ q0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+ q3 = __lsx_vldx(src, pitch3);
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+ __lsx_vstx(p1, src, -pitch2);
+ __lsx_vstx(p0, src, -pitch);
+ __lsx_vst(q0, src, 0);
+ __lsx_vstx(q1, src, pitch);
+}
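+
+/* The dual variant filters two adjacent 8-pixel edges in one pass: each
+ * byte-replicated threshold pair is interleaved into the low and high
+ * 64-bit halves of a single vector (vilvl_d above), so the 16-lane filter
+ * applies each edge's limits to its own half. */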
+
+void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, limit, thresh, b_limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i vec0, vec1, vec2, vec3;
+ int32_t pitch2 = pitch << 1;
+ int32_t pitch3 = pitch2 + pitch;
+ int32_t pitch4 = pitch2 << 1;
+ uint8_t *src_tmp = src - 4;
+
+ p3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1);
+ p0 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ q0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2);
+ q3 = __lsx_vldx(src_tmp, pitch3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
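+ /* transpose the 8x8 tile so each image column becomes one vector row */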
+ LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
+ vec2 = __lsx_vilvl_h(vec1, vec0);
+ vec3 = __lsx_vilvh_h(vec1, vec0);
+
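+ /* each 32-bit lane of vec2/vec3 now holds one row's (p1, p0, q0, q1) quad */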
+ src -= 2;
+ __lsx_vstelm_w(vec2, src, 0, 0);
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 3);
+ src += pitch;
+
+ __lsx_vstelm_w(vec3, src, 0, 0);
+ __lsx_vstelm_w(vec3, src + pitch, 0, 1);
+ __lsx_vstelm_w(vec3, src + pitch2, 0, 2);
+ __lsx_vstelm_w(vec3, src + pitch3, 0, 3);
+}
+
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ __m128i mask, hev, flat;
+ __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ int32_t pitch2 = pitch << 1;
+ int32_t pitch3 = pitch2 + pitch;
+ int32_t pitch4 = pitch2 << 1;
+ uint8_t *src_tmp = src - 4;
+
+ row0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2);
+ row3 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6);
+ row7 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row8 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10);
+ row11 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row12 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14);
+ row15 = __lsx_vldx(src_tmp, pitch3);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
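+
+ /* the sixteen 8-pixel rows become eight 16-lane vectors, one per pixel
+  * position relative to the edge */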
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+ tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+ tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+ DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+ tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+ tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+ src -= 2;
+ __lsx_vstelm_w(tmp2, src, 0, 0);
+ __lsx_vstelm_w(tmp2, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp2, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp2, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp3, src, 0, 0);
+ __lsx_vstelm_w(tmp3, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp3, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp3, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp4, src, 0, 0);
+ __lsx_vstelm_w(tmp4, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp4, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp4, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp5, src, 0, 0);
+ __lsx_vstelm_w(tmp5, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp5, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp5, src + pitch3, 0, 3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
new file mode 100644
index 0000000000..00219ba71d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, thresh, b_limit, limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
+ __m128i p2_filter8, p1_filter8, p0_filter8;
+ __m128i q0_filter8, q1_filter8, q2_filter8;
+ __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+ -stride, p3, p2, p1, p0);
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = __lsx_vilvl_d(flat, flat);
+
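+ /* when every lane of flat is zero, only the 4-tap output is stored */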
+ if (__lsx_bz_v(flat)) {
+ __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+ __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+ __lsx_vstelm_d(q0_out, dst, 0, 0);
+ __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+ } else {
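+ /* widen the low 8 bytes of each vector to 16-bit lanes for the filter8
+  * arithmetic */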
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
+ p1_filter8, q0_filter8);
+ q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);
+
+ p2 = __lsx_vilvl_d(p1_out, p2);
+ p0_out = __lsx_vilvl_d(q0_out, p0_out);
+ q1_out = __lsx_vilvl_d(q2, q1_out);
+
+ DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
+ p2_out, p1_out);
+ p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
+ dst -= stride3;
+
+ __lsx_vstelm_d(p2_out, dst, 0, 0);
+ __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
+ __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
+ __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);
+
+ dst += stride4;
+ __lsx_vstelm_d(p0_out, dst, 0, 0);
+ dst += stride;
+ __lsx_vstelm_d(p0_out, dst, 0, 1);
+ }
+}
+
+void vpx_lpf_horizontal_8_dual_lsx(
+ uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+ -stride, p3, p2, p1, p0);
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
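+ /* p2_out is borrowed as a scratch register while the two threshold sets
+  * are packed; the low half of each packed vector applies to the first
+  * 8 columns and the high half to the second */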
+ thresh = __lsx_vldrepl_b(thresh0, 0);
+ p2_out = __lsx_vldrepl_b(thresh1, 0);
+ thresh = __lsx_vilvl_d(p2_out, thresh);
+
+ b_limit = __lsx_vldrepl_b(b_limit0, 0);
+ p2_out = __lsx_vldrepl_b(b_limit1, 0);
+ b_limit = __lsx_vilvl_d(p2_out, b_limit);
+
+ limit = __lsx_vldrepl_b(limit0, 0);
+ p2_out = __lsx_vldrepl_b(limit1, 0);
+ limit = __lsx_vilvl_d(p2_out, limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__lsx_bz_v(flat)) {
+ __lsx_vst(p1_out, dst - stride2, 0);
+ __lsx_vst(p0_out, dst - stride, 0);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vst(q1_out, dst + stride, 0);
+ } else {
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16-bit output data into 8-bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* store pixel values */
+ p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ __lsx_vst(p2_out, dst - stride3, 0);
+ __lsx_vst(p1_out, dst - stride2, 0);
+ __lsx_vst(p0_out, dst - stride, 0);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vst(q1_out, dst + stride, 0);
+ __lsx_vst(q2_out, dst + stride2, 0);
+ }
+}
+
+void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p1_out, p0_out, q0_out, q1_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i zero = __lsx_vldi(0);
+
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+ uint8_t *dst_tmp = dst - 4;
+
+ /* load vector elements */
+ p3 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
+ p0 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+ q0 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
+ q3 = __lsx_vldx(dst_tmp, stride3);
+
+ LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
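+ /* only the low 8 columns are filtered here, so the upper half of flat is
+  * cleared */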
+ flat = __lsx_vilvl_d(zero, flat);
+
+ /* if flat is zero for all pixels, the 8-tap filter can be skipped */
+ if (__lsx_bz_v(flat)) {
+ /* store the four filtered pixels p1..q1 */
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+ p2 = __lsx_vilvl_h(p1, p0);
+ p3 = __lsx_vilvh_h(p1, p0);
+
+ dst -= 2;
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_w(p2, dst + stride, 0, 1);
+ __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(p3, dst, 0, 0);
+ __lsx_vstelm_w(p3, dst + stride, 0, 1);
+ __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+ } else {
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
+ p1_l, p0_l);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
+ q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ /* convert 16-bit output data into 8-bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+ p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+ /* store pixel values */
+ p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ /* store the six filtered pixels p2..q2 */
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+ p1 = __lsx_vilvl_h(q3, p3);
+ p2 = __lsx_vilvh_h(q3, p3);
+ p3 = __lsx_vilvl_b(q2, q1);
+ dst -= 3;
+ __lsx_vstelm_w(p1, dst, 0, 0);
+ __lsx_vstelm_h(p3, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 1);
+ __lsx_vstelm_h(p3, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 2);
+ __lsx_vstelm_h(p3, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 3);
+ __lsx_vstelm_h(p3, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_h(p3, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 1);
+ __lsx_vstelm_h(p3, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 2);
+ __lsx_vstelm_h(p3, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 3);
+ __lsx_vstelm_h(p3, dst, 4, 7);
+ }
+}
+
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
+ const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8_t *dst_tmp = dst - 4;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p1_out, p0_out, q0_out, q1_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i row4, row5, row6, row7, row12, row13, row14, row15;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+
+ p0 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
+ p3 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+ row4 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
+ row7 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+
+ q3 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
+ q0 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+ row12 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
+ row15 = __lsx_vldx(dst_tmp, stride3);
+
+ /* transpose 16x8 matrix into 8x16 */
+ LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = __lsx_vldrepl_b(thresh0, 0);
+ p1_out = __lsx_vldrepl_b(thresh1, 0);
+ thresh = __lsx_vilvl_d(p1_out, thresh);
+
+ b_limit = __lsx_vldrepl_b(b_limit0, 0);
+ p1_out = __lsx_vldrepl_b(b_limit1, 0);
+ b_limit = __lsx_vilvl_d(p1_out, b_limit);
+
+ limit = __lsx_vldrepl_b(limit0, 0);
+ p1_out = __lsx_vldrepl_b(limit1, 0);
+ limit = __lsx_vilvl_d(p1_out, limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ /* if flat is zero for all pixels, the 8-tap filter can be skipped */
+ if (__lsx_bz_v(flat)) {
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+ p2 = __lsx_vilvl_h(p1, p0);
+ p3 = __lsx_vilvh_h(p1, p0);
+ DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+ q2 = __lsx_vilvl_h(p1, p0);
+ q3 = __lsx_vilvh_h(p1, p0);
+ dst -= 2;
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_w(p2, dst + stride, 0, 1);
+ __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(p3, dst, 0, 0);
+ __lsx_vstelm_w(p3, dst + stride, 0, 1);
+ __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(q2, dst, 0, 0);
+ __lsx_vstelm_w(q2, dst + stride, 0, 1);
+ __lsx_vstelm_w(q2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(q2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(q3, dst, 0, 0);
+ __lsx_vstelm_w(q3, dst + stride, 0, 1);
+ __lsx_vstelm_w(q3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(q3, dst + stride3, 0, 3);
+ } else {
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+
+ /* filter8 */
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16-bit output data into 8-bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* store pixel values */
+ p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+ p2_filt8_l = __lsx_vilvl_h(q3, p3);
+ p2_filt8_h = __lsx_vilvh_h(q3, p3);
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
+ p0_filt8_l = __lsx_vilvl_h(q3, p3);
+ p0_filt8_h = __lsx_vilvh_h(q3, p3);
+ q1_filt8_l = __lsx_vilvl_b(q2, q1);
+ q1_filt8_h = __lsx_vilvh_b(q2, q1);
+
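+ /* each of the 16 rows receives 6 bytes: a (p2, p1, p0, q0) word plus a
+  * (q1, q2) halfword, starting three pixels left of the edge */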
+ dst -= 3;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
new file mode 100644
index 0000000000..1c43836503
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ do { \
+ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \
+ p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \
+ p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \
+ q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \
+ q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \
+ q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \
+ p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \
+ p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \
+ \
+ /* calculation of hev */ \
+ flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = __lsx_vslt_bu(thresh_in, flat_out); \
+ \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \
+ mask_out = __lsx_vmax_bu(flat_out, mask_out); \
+ p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = __lsx_vslt_bu(limit_in, mask_out); \
+ mask_out = __lsx_vxori_b(mask_out, 0xff); \
+ } while (0)
+
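+/* VP9_FLAT4: flat_out is set in lanes where the incoming flat value and
+ * |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0| are all <= 1, i.e. the block is smooth
+ * enough for the 8-tap filter.  Note that the macro also reads a variable
+ * named `mask` from the expanding scope. */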
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ do { \
+ __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \
+ __m128i flat4_tmp = __lsx_vldi(1); \
+ \
+ DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
+ q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \
+ p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \
+ flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \
+ p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \
+ flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \
+ \
+ flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \
+ flat_out = __lsx_vxori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
+ } while (0)
+
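+/* VP9_FLAT5: the same flatness test applied to the outer taps p4..p7 and
+ * q4..q7, gated by flat_in; it selects lanes for the 16-wide filter. */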
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ do { \
+ __m128i flat5_tmp = __lsx_vldi(1); \
+ __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \
+ __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \
+ DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
+ q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \
+ DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
+ q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \
+ \
+ DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \
+ p4_asub_p0, flat2_out); \
+ flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \
+ p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \
+ flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \
+ p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \
+ flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \
+ flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \
+ flat2_out = __lsx_vxori_b(flat2_out, 0xff); \
+ flat2_out = flat2_out & flat_in; \
+ } while (0)
+
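+/* VP9_LPF_FILTER4_4W: the standard VP9 4-tap filter.  Pixels are biased into
+ * the signed domain with xor 0x80, filt = clamp(clamp(p1 - q1) & hev +
+ * 3 * (q0 - p0)) & mask, then q0 -= (filt + 4) >> 3 and
+ * p0 += (filt + 3) >> 3, with p1/q1 nudged by a rounded half step in lanes
+ * where hev is not set. */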
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
+ p0_out, q0_out, q1_out) \
+ do { \
+ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const __m128i cnst4b = __lsx_vldi(4); \
+ const __m128i cnst3b = __lsx_vldi(3); \
+ DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \
+ 0x80, p1_m, p0_m, q0_m, q1_m); \
+ filt = __lsx_vssub_b(p1_m, q1_m); \
+ filt &= hev; \
+ \
+ q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \
+ DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \
+ \
+ q0_m = __lsx_vssub_b(q0_m, t1); \
+ p0_m = __lsx_vsadd_b(p0_m, t2); \
+ DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \
+ \
+ filt = __lsx_vsrari_b(t1, 1); \
+ hev = __lsx_vxori_b(hev, 0xff); \
+ filt &= hev; \
+ q1_m = __lsx_vssub_b(q1_m, filt); \
+ p1_m = __lsx_vsadd_b(p1_m, filt); \
+ DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \
+ } while (0)
+
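+/* VP9_FILTER8: the 8-tap smoothing filter on 16-bit lanes.  Each output is a
+ * rounded neighbourhood average, e.g.
+ * p2_filt8_out = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3. */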
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ do { \
+ __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
+ \
+ tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \
+ tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \
+ tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \
+ p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \
+ p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \
+ tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \
+ p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \
+ tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \
+ tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \
+ q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \
+ q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \
+ tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \
+ q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c
new file mode 100644
index 0000000000..77be0bb4fe
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
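+/* Quantizes 8 coefficients: with x = sat(|coeff| + round), the result is
+ * sign(coeff) * ((((x * quant) >> 16) + x) * shift) >> 16, zeroed in lanes
+ * that fall below the zbin threshold (cmp_mask). */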
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+ __m128i round, __m128i quant,
+ __m128i shift, __m128i cmp_mask) {
+ __m128i rounded, qcoeff;
+
+ rounded = __lsx_vsadd_h(coeff_abs, round);
+ qcoeff = __lsx_vmuh_h(rounded, quant);
+ qcoeff = __lsx_vadd_h(rounded, qcoeff);
+ qcoeff = __lsx_vmuh_h(qcoeff, shift);
+ qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
+ qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
+
+ return qcoeff;
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ int16_t *dqcoeff) {
+ __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+ __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+ __m128i dequant,
+ int16_t *dqcoeff) {
+ // Work on absolute values so the rounding bias matches the C reference.
+ __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+ __m128i zero = __lsx_vldi(0);
+ __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+
+ const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+ const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+
+ low = __lsx_vmul_h(coeff, dequant);
+ high = __lsx_vmuh_h(coeff, dequant);
+ dqcoeff32_0 = __lsx_vilvl_h(high, low);
+ dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+ // "Divide" by 2.
+ dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+ dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+ dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
+ dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+ res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
+ __lsx_vst(res, dqcoeff, 0);
+}
+
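+/* Keeps the iscan position in lanes whose coefficient is non-zero and takes
+ * the lane-wise maximum of the two vectors; accumulate_eob() reduces the
+ * running maximum to the scalar eob. */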
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+ const int16_t *scan, int index,
+ __m128i zero) {
+ const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+ const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+ __m128i scan0 = __lsx_vld(scan + index, 0);
+ __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+ __m128i eob0, eob1;
+
+ eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+ eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+ return __lsx_vmax_h(eob0, eob1);
+}
+
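+/* Horizontal max reduction: fold the vector in half three times so every
+ * lane holds the overall maximum, then extract it. */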
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ int16_t res_m;
+
+ eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ res_m = __lsx_vpickve2gr_h(eob, 1);
+
+ return res_m;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m128i zero = __lsx_vldi(0);
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, quant_shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+ zbin = __lsx_vld(zbin_ptr, 0);
+ round = __lsx_vld(round_ptr, 0);
+ quant = __lsx_vld(quant_ptr, 0);
+ dequant = __lsx_vld(dequant_ptr, 0);
+ quant_shift = __lsx_vld(quant_shift_ptr, 0);
+ // Handle the DC coefficient and the first 15 AC coefficients.
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ zbin = __lsx_vilvh_d(zbin, zbin);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ round = __lsx_vilvh_d(round, round);
+ quant = __lsx_vilvh_d(quant, quant);
+ quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+ __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = __lsx_vilvh_d(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = __lsx_vld(coeff_ptr + index, 0);
+ coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+ __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+ eob = __lsx_vmax_h(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zero = __lsx_vldi(0);
+ int index;
+
+ __m128i zbin, round, quant, dequant, quant_shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+ (void)n_coeffs;
+
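+ // 32x32 blocks quantize with zbin and round halved (with rounding) and
+ // quant_shift doubled; calculate_dqcoeff_and_store_32x32() divides by 2
+ // to match.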
+ zbin = __lsx_vld(zbin_ptr, 0);
+ zbin = __lsx_vsrari_h(zbin, 1);
+ round = __lsx_vld(round_ptr, 0);
+ round = __lsx_vsrari_h(round, 1);
+
+ quant = __lsx_vld(quant_ptr, 0);
+ dequant = __lsx_vld(dequant_ptr, 0);
+ quant_shift = __lsx_vld(quant_shift_ptr, 0);
+ quant_shift = __lsx_vslli_h(quant_shift, 1);
+ // Handle the DC coefficient and the first 15 AC coefficients.
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ // remove DC from zbin
+ zbin = __lsx_vilvh_d(zbin, zbin);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ // remove DC from round, quant and quant_shift
+ round = __lsx_vilvh_d(round, round);
+ quant = __lsx_vilvh_d(quant, quant);
+ quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = __lsx_vilvh_d(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = __lsx_vld(coeff_ptr + index, 0);
+ coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+ dqcoeff_ptr + 8 + index);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+ eob = __lsx_vmax_h(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000000..b6fbedb0d0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
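+/* SAD helpers: absolute byte differences are pairwise widened to 16-bit
+ * lanes and accumulated per iteration, then reduced to a scalar once per
+ * block. */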
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+ __m128i ref1) {
+ __m128i diff0_m, diff1_m, sad_m0;
+ __m128i sad_m = __lsx_vldi(0);
+
+ diff0_m = __lsx_vabsd_bu(in0, ref0);
+ diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+ sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+ sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+ return sad_m;
+}
+
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+ __m128i res0_m;
+ uint32_t sum_m;
+
+ res0_m = __lsx_vhaddw_du_wu(in, in);
+ res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
+
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+ __m128i res_m;
+ uint32_t sum_m;
+
+ res_m = __lsx_vhaddw_wu_hu(in, in);
+ sum_m = hadd_uw_u32(res_m);
+
+ return sum_m;
+}
+
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+ __m128i res0_m;
+ int32_t sum_m;
+
+ res0_m = __lsx_vhaddw_d_w(in, in);
+ res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
+
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t res;
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+ src += src_stride;
+ ref += ref_stride;
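+ /* pack pairs of 8-byte rows into full vectors before taking the SAD */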
+ DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+ src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+
+static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 2);
+ uint32_t res;
+ __m128i src0, src1, ref0, ref1, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+ int32_t src_stride2 = src_stride << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+ src += src_stride2;
+ ref += ref_stride2;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+ src += src_stride2;
+ ref += ref_stride2;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+
+ res = hadd_uh_u32(sad);
+ return res;
+}
+
+static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 2);
+ uint32_t res;
+ __m128i src0, src1, ref0, ref1;
+ __m128i sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+
+static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 1);
+ uint32_t sad = 0;
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ }
+
+ sad = hadd_uh_u32(sad0);
+ sad += hadd_uh_u32(sad1);
+
+ return sad;
+}
+
+static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt = (height >> 2);
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ __m128i src0, src1, src2, src3, sad_tmp;
+ __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ src0 = __lsx_vld(src_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_ptr, src_stride3);
+ src_ptr += src_stride4;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
+ ref2);
+ ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
+ ref0_ptr += ref_stride4;
+ ref4 = __lsx_vld(ref1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
+ ref6);
+ ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
+ ref1_ptr += ref_stride4;
+ ref8 = __lsx_vld(ref2_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
+ ref10);
+ ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
+ ref2_ptr += ref_stride4;
+ ref12 = __lsx_vld(ref3_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
+ ref14);
+ ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
+ ref3_ptr += ref_stride4;
+
+ DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt = (height >> 1);
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ ref0_ptr += ref_stride;
+ ref1 = __lsx_vld(ref1_ptr, 0);
+ ref1_ptr += ref_stride;
+ ref2 = __lsx_vld(ref2_ptr, 0);
+ ref2_ptr += ref_stride;
+ ref3 = __lsx_vld(ref3_ptr, 0);
+ ref3_ptr += ref_stride;
+
+ diff = __lsx_vabsd_bu(src, ref0);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref1);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref2);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref3);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ ref0_ptr += ref_stride;
+ ref1 = __lsx_vld(ref1_ptr, 0);
+ ref1_ptr += ref_stride;
+ ref2 = __lsx_vld(ref2_ptr, 0);
+ ref2_ptr += ref_stride;
+ ref3 = __lsx_vld(ref3_ptr, 0);
+ ref3_ptr += ref_stride;
+
+ diff = __lsx_vabsd_bu(src, ref0);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref1);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref2);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref3);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt = height;
+ __m128i src0, src1, ref0, ref1, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+
+ DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
+ ref0_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
+ ref1_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
+ ref2_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
+ ref3_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i sad, sad_tmp;
+
+ __m128i sad0_0 = __lsx_vldi(0);
+ __m128i sad0_1 = sad0_0;
+ __m128i sad1_0 = sad0_0;
+ __m128i sad1_1 = sad0_0;
+ __m128i sad2_0 = sad0_0;
+ __m128i sad2_1 = sad0_0;
+ __m128i sad3_0 = sad0_0;
+ __m128i sad3_1 = sad0_0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref0_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref1_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref2_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref3_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
+ }
+ sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[0] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[1] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[2] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[3] = hadd_uw_u32(sad);
+}
+
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i comp0, comp1, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+ const uint8_t *src_tmp, *ref_tmp;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
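+ /* the reference rows are first averaged (with rounding) against the
+  * second predictor, then the SAD is taken against the source */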
+ for (; ht_cnt--;) {
+ src_tmp = src + 16;
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src1 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ ref_tmp = ref + 16;
+ ref0 = __lsx_vld(ref, 0);
+ DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+ ref6 = __lsx_vldx(ref, ref_stride3);
+ ref1 = __lsx_vld(ref_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+ ref5);
+ ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+ ref += ref_stride4;
+
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+ pred0, pred2, pred4, pred6);
+ DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+ 112, pred1, pred3, pred5, pred7);
+ sec_pred += 128;
+
+ DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+ __m128i sad, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ }
+ sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+
+ res = hadd_sw_s32(sad);
+ return res;
+}
+
+#define VPX_SAD_8xHT_LSX(height) \
+ uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_16xHT_LSX(height) \
+ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_32xHT_LSX(height) \
+ uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_64xHT_LSX(height) \
+ uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_8xHTx4D_LSX(height) \
+ void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_16xHTx4D_LSX(height) \
+ void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_32xHTx4D_LSX(height) \
+ void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_64xHTx4D_LSX(height) \
+ void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_AVGSAD_32xHT_LSX(height) \
+ uint32_t vpx_sad32x##height##_avg_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define VPX_AVGSAD_64xHT_LSX(height) \
+ uint32_t vpx_sad64x##height##_avg_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
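+/* Instantiate the exported entry points: plain SAD, four-reference SADx4,
+ * and SAD against an averaged second predictor, for each block size this
+ * LSX path covers. */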
+#define SAD64 \
+ VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
+ VPX_AVGSAD_64xHT_LSX(64)
+
+SAD64
+
+#define SAD32 \
+ VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
+ VPX_AVGSAD_32xHT_LSX(32)
+
+SAD32
+
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+
+SAD16
+
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
+
+SAD8
+
+#undef SAD64
+#undef SAD32
+#undef SAD16
+#undef SAD8
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
new file mode 100644
index 0000000000..700793531c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+#include "vpx_dsp/variance.h"
+
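+/* Two-tap bilinear taps for each eighth-pel offset; every pair sums to 128
+ * (1 << FILTER_BITS), so the rounding shifts by FILTER_BITS below
+ * renormalize the filtered output. */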
+static const uint8_t bilinear_filters_lsx[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
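+/* variance = sse - (sum * sum) / N for an N = width * height block; the
+ * division is a right shift since shift = log2(N). */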
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
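+/* Accumulates sse (returned) and the signed difference sum (*diff) between
+ * avg(src, sec_pred) and ref over a 64x64 block, two rows per iteration. */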
+static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t res, ht_cnt = 32;
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
+ __m128i avg0, avg1, avg2, avg3;
+ __m128i var = __lsx_vldi(0);
+
+ avg0 = var;
+ avg1 = var;
+ avg2 = var;
+ avg3 = var;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+ pred3, src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+ pred3, src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+ vec = __lsx_vhaddw_w_h(avg0, avg0);
+ vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
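+/* Horizontal two-tap subpel filtering fused with the sse/sum accumulation.
+ * The shuffle mask pairs each pixel with its right neighbour so a single
+ * dot product yields, per output pixel:
+ *   out[x] = ROUND_POWER_OF_TWO(src[x] * filter[0] + src[x + 1] * filter[1],
+ *                               FILTER_BITS);
+ */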
+static uint32_t sub_pixel_sse_diff_8width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
+ FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
+ src0, src1, src2, src3);
+ out = __lsx_vpackev_d(src1, src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = __lsx_vpackev_d(src3, src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i dst0, dst1, dst2, dst3, filt0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i vec, var = __lsx_vldi(0);
+ __m128i avg = var;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t sse = 0;
+ int32_t diff0[2];
+
+ sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[0]);
+ src += 16;
+ dst += 16;
+
+ sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[1]);
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+ __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ src0 = src4;
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+ __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i var = __lsx_vldi(0);
+ __m128i avg = var;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t sse = 0;
+ int32_t diff0[2];
+
+ sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[0]);
+ src += 16;
+ dst += 16;
+
+ sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[1]);
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
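+/* Two-dimensional subpel: each row is filtered horizontally into hz_out*,
+ * then consecutive filtered rows feed the vertical two-tap filter. The last
+ * filtered row is carried across loop iterations instead of being
+ * recomputed. */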
+static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0);
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
+ src += src_stride;
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
+ __m128i var = __lsx_vldi(0);
+ __m128i avg = var;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t sse = 0;
+ int32_t diff0[2];
+
+ sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[0]);
+ src += 16;
+ dst += 16;
+
+ sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[1]);
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
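+/* The subpel_avg_ssediff_16w_* helpers mirror the sub_pixel_sse_diff_*
+ * routines above but average the filtered result with sec_pred before
+ * accumulating. width is the stride of the sec_pred buffer, which lets
+ * 64-wide blocks be processed as four 16-pixel columns. */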
+static uint32_t subpel_avg_ssediff_16w_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ __m128i pred0, pred1, pred2, pred3, filt0, vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
+ pred3, tmp0, tmp1, tmp2, tmp3);
+
+ CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+ CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+ CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+ CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+ __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1, vec, filt0;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ src += src_stride;
+ src2 = __lsx_vld(src, 0);
+ src += src_stride;
+ src3 = __lsx_vld(src, 0);
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ src0 = src4;
+ ref0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+ pred3, out0, out1, out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+ uint32_t loop_cnt = (height >> 2);
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+ HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ ref0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+ pred3, out0, out1, out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
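+/* Entry-point generator: picks the h, v, hv, or unfiltered variance kernel
+ * from the subpel offsets; an offset of 0 means no filtering is needed in
+ * that dimension. */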
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \
+ uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \
+ const uint8_t *src, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \
+ const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \
+ \
+ if (y_offset) { \
+ if (x_offset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (x_offset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \
+ uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \
+ const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \
+ \
+ if (y_offset) { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); \
+ }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64)
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 0000000000..943a5c5a9b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
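+/* Each sub_blk_WxH helper computes
+ *   diff[r * diff_stride + c] =
+ *       src[r * src_stride + c] - pred[r * pred_stride + c];
+ * by interleaving src and pred bytes and widening the difference to signed
+ * 16 bits with __lsx_vhsubw_hu_bu. Store offsets are in bytes, so the
+ * dst_stride locals below are diff_stride scaled by sizeof(int16_t). */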
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3;
+ __m128i pred0, pred1, pred2, pred3;
+ __m128i diff0, diff1;
+ __m128i reg0, reg1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t diff_stride2 = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+ DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+ pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+ src0, src2, pred0, pred2);
+ DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
+ reg0 = __lsx_vilvl_b(src0, pred0);
+ reg1 = __lsx_vilvh_b(src0, pred0);
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+ __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+ __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+ __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+ __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t dst_stride = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
+ DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+ pred1, pred2, pred3);
+ src_ptr += src_stride4;
+ pred_ptr += pred_stride4;
+
+ DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+ pred5, pred6, pred7);
+
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ __lsx_vst(src0, diff_ptr, 0);
+ __lsx_vstx(src1, diff_ptr, dst_stride);
+ __lsx_vstx(src2, diff_ptr, dst_stride2);
+ __lsx_vstx(src3, diff_ptr, dst_stride3);
+ diff_ptr += dst_stride2;
+ __lsx_vst(src4, diff_ptr, 0);
+ __lsx_vstx(src5, diff_ptr, dst_stride);
+ __lsx_vstx(src6, diff_ptr, dst_stride2);
+ __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t dst_stride = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+ int16_t *diff_tmp = diff + 8;
+
+ DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+ pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+ src += src_stride4;
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ pred, pred_stride, src5, src6, src7, pred5);
+ DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+ src += src_stride4;
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+ pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+ pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0);
+ __lsx_vstx(src2, diff, dst_stride);
+ __lsx_vstx(src4, diff, dst_stride2);
+ __lsx_vstx(src6, diff, dst_stride3);
+ __lsx_vst(src1, diff_tmp, 0);
+ __lsx_vstx(src3, diff_tmp, dst_stride);
+ __lsx_vstx(src5, diff_tmp, dst_stride2);
+ __lsx_vstx(src7, diff_tmp, dst_stride3);
+ diff += dst_stride2;
+ diff_tmp += dst_stride2;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vstx(pred2, diff, dst_stride);
+ __lsx_vstx(pred4, diff, dst_stride2);
+ __lsx_vstx(pred6, diff, dst_stride3);
+ __lsx_vst(pred1, diff_tmp, 0);
+ __lsx_vstx(pred3, diff_tmp, dst_stride);
+ __lsx_vstx(pred5, diff_tmp, dst_stride2);
+ __lsx_vstx(pred7, diff_tmp, dst_stride3);
+ diff += dst_stride2;
+ diff_tmp += dst_stride2;
+ DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+ pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+ src += src_stride4;
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ pred, pred_stride, src5, src6, src7, pred5);
+ DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+ pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+ pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0);
+ __lsx_vstx(src2, diff, dst_stride);
+ __lsx_vstx(src4, diff, dst_stride2);
+ __lsx_vstx(src6, diff, dst_stride3);
+ __lsx_vst(src1, diff_tmp, 0);
+ __lsx_vstx(src3, diff_tmp, dst_stride);
+ __lsx_vstx(src5, diff_tmp, dst_stride2);
+ __lsx_vstx(src7, diff_tmp, dst_stride3);
+ diff += dst_stride2;
+ diff_tmp += dst_stride2;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vstx(pred2, diff, dst_stride);
+ __lsx_vstx(pred4, diff, dst_stride2);
+ __lsx_vstx(pred6, diff, dst_stride3);
+ __lsx_vst(pred1, diff_tmp, 0);
+ __lsx_vstx(pred3, diff_tmp, dst_stride);
+ __lsx_vstx(pred5, diff_tmp, dst_stride2);
+ __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ uint32_t loop_cnt;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ const uint8_t *src_tmp = src + 16;
+ const uint8_t *pred_tmp = pred + 16;
+ DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+ pred0, pred1);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+ DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+ pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+ pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+ reg7, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+ tmp7, pred4, pred5, pred6, pred7);
+ src += src_stride4;
+ pred += pred_stride4;
+ __lsx_vst(src0, diff, 0);
+ __lsx_vst(src1, diff, 16);
+ __lsx_vst(src2, diff, 32);
+ __lsx_vst(src3, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(src4, diff, 0);
+ __lsx_vst(src5, diff, 16);
+ __lsx_vst(src6, diff, 32);
+ __lsx_vst(src7, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vst(pred1, diff, 16);
+ __lsx_vst(pred2, diff, 32);
+ __lsx_vst(pred3, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(pred4, diff, 0);
+ __lsx_vst(pred5, diff, 16);
+ __lsx_vst(pred6, diff, 32);
+ __lsx_vst(pred7, diff, 48);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ uint32_t loop_cnt;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+ pred2, pred3);
+ src += src_stride;
+ pred += pred_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+ src7);
+ DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+ pred6, pred7);
+ src += src_stride;
+ pred += pred_stride;
+
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+ reg7, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+ tmp7, pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0);
+ __lsx_vst(src1, diff, 16);
+ __lsx_vst(src2, diff, 32);
+ __lsx_vst(src3, diff, 48);
+ __lsx_vst(src4, diff, 64);
+ __lsx_vst(src5, diff, 80);
+ __lsx_vst(src6, diff, 96);
+ __lsx_vst(src7, diff, 112);
+ diff += diff_stride;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vst(pred1, diff, 16);
+ __lsx_vst(pred2, diff, 32);
+ __lsx_vst(pred3, diff, 48);
+ __lsx_vst(pred4, diff, 64);
+ __lsx_vst(pred5, diff, 80);
+ __lsx_vst(pred6, diff, 96);
+ __lsx_vst(pred7, diff, 112);
+ diff += diff_stride;
+ }
+}
+
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ default:
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
new file mode 100644
index 0000000000..bd514831bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
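+/* Butterfly rotation on two vectors of 16-bit coefficients (products are
+ * widened to 32 bits, rnd = 1 << (DCT_CONST_BITS - 1)):
+ *   out0 = saturate16((reg0 * cnst0 - reg1 * cnst1 + rnd) >> DCT_CONST_BITS)
+ *   out1 = saturate16((reg1 * cnst0 + reg0 * cnst1 + rnd) >> DCT_CONST_BITS)
+ */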
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ __m128i k0_m, k1_m, k2_m, k3_m; \
+ \
+ k0_m = __lsx_vreplgr2vr_h(cnst0); \
+ k1_m = __lsx_vreplgr2vr_h(cnst1); \
+ k2_m = __lsx_vpackev_h(k1_m, k0_m); \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \
+ \
+ DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+ k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \
+ s1_m = __lsx_vsub_w(s1_m, k3_m); \
+ k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \
+ s0_m = __lsx_vsub_w(s0_m, k3_m); \
+ \
+ out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \
+ out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ } while (0)
+
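+/* in3 = the rounded, saturated pairwise dot products of (in1, in2) in its
+ * low half and (in0, in2) in its high half, shifted by DCT_CONST_BITS. */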
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+ in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
new file mode 100644
index 0000000000..8fad342c71
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+
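+/* The squared difference sum exceeds 32 bits once the block reaches 32x32
+ * (the sum can be up to 255 * 1024), so the LARGE variant widens the
+ * product to int64_t before the log2(width * height) shift. */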
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
+static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3);
+ src_ptr += src_stride4;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0,
+ ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1,
+ ref2, ref3);
+ ref_ptr += ref_stride4;
+
+ DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+ src0, src1, ref0, ref1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src, ref, vec;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ for (; ht_cnt--;) {
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i avg = __lsx_vldi(0);
+ __m128i src0, src1, ref0, ref1;
+ __m128i vec;
+ __m128i var = avg;
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __lsx_vhaddw_w_h(avg, avg);
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t res, ht_cnt = 32;
+ __m128i avg0 = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i vec0, vec1;
+ __m128i avg1 = avg0;
+ __m128i avg2 = avg0;
+ __m128i avg3 = avg0;
+ __m128i var = avg0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
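+  /* Four 16-bit accumulators keep the 64-wide difference sums from
+   * overflowing; widen and merge them before the final reduction. */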
+ vec0 = __lsx_vhaddw_w_h(avg0, avg0);
+ vec1 = __lsx_vhaddw_w_h(avg1, avg1);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ vec1 = __lsx_vhaddw_w_h(avg2, avg2);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ vec1 = __lsx_vhaddw_w_h(avg3, avg3);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ HADD_SW_S32(vec0, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
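+/* For 32x32 and larger blocks the squared sum can exceed 32 bits, so widen
+ * to int64_t before the shift. */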
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \
+ uint32_t vpx_variance##wd##x##ht##_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
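+/* Sum of squared errors only (no mean), used by vpx_mse16x16_lsx below. */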
+static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src, ref;
+ __m128i var = __lsx_vldi(0);
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+ HADD_SW_S32(var, res);
+ return res;
+}
+
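+/* Instantiate vpx_variance8x8_lsx, vpx_variance16x16_lsx and
+ * vpx_variance32x32_lsx from the template above; e.g. the 8x8 variant
+ * returns sse - (sum * sum >> 6). */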
+VPX_VARIANCE_WDXHT_LSX(8, 8)
+VPX_VARIANCE_WDXHT_LSX(16, 16)
+VPX_VARIANCE_WDXHT_LSX(32, 32)
+
+uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
new file mode 100644
index 0000000000..cf9e9890ff
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
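+/* Horizontally reduce the four signed 32-bit lanes of in0 into the scalar
+ * in1. */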
+#define HADD_SW_S32(in0, in1) \
+ do { \
+ __m128i res0_m; \
+ \
+ res0_m = __lsx_vhaddw_d_w(in0, in0); \
+ res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+ in1 = __lsx_vpickve2gr_w(res0_m, 0); \
+ } while (0)
+
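+/* Bilinear horizontal filter: gather adjacent pixel pairs with mask, take
+ * the unsigned dot product with the duplicated tap pair in coeff, then
+ * round-shift the halfword results into in2. */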
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \
+ do { \
+ __m128i tmp0_m, tmp1_m; \
+ \
+ tmp0_m = __lsx_vshuf_b(in1, in0, mask); \
+ tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \
+ in2 = __lsx_vsrari_h(tmp1_m, shift); \
+ } while (0)
+
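+/* Accumulate squared differences for 16 pixels: interleave src/ref bytes,
+ * form per-pixel (src - ref) with widening horizontal subtracts, then dot
+ * each difference vector with itself into var. */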
+#define CALC_MSE_B(src, ref, var) \
+ do { \
+ __m128i src_l0_m, src_l1_m; \
+ __m128i res_l0_m, res_l1_m; \
+ \
+ src_l0_m = __lsx_vilvl_b(src, ref); \
+ src_l1_m = __lsx_vilvh_b(src, ref); \
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+ res_l0_m, res_l1_m); \
+ var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \
+ var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \
+ } while (0)
+
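+/* As CALC_MSE_B, but also accumulates the raw differences into sub so the
+ * caller can recover the block mean. */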
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ do { \
+ __m128i src_l0_m, src_l1_m; \
+ __m128i res_l0_m, res_l1_m; \
+ \
+ src_l0_m = __lsx_vilvl_b(src, ref); \
+ src_l1_m = __lsx_vilvh_b(src, ref); \
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+ res_l0_m, res_l1_m); \
+ var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \
+ var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \
+ sub = __lsx_vadd_h(sub, res_l0_m); \
+ sub = __lsx_vadd_h(sub, res_l1_m); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
new file mode 100644
index 0000000000..1c59228813
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
@@ -0,0 +1,972 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
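+
+/* Each mask row drives __lsx_vshuf_b to gather the overlapping pixel pairs
+ * consumed by the dot-product filter kernels below. */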
+
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1;
+ __m128i dst0, dst1, dst2, dst3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
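+  /* Step back three columns so the 8-tap window is centred on each output
+   * pixel. */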
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
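+  /* XOR with 128 converts the unsigned pixels to signed form (p - 128) so
+   * signed byte multiplies can be used; the bias is removed again below. */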
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp0, tmp1);
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
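+  /* Narrow with rounding by FILTER_BITS, remove the sign bias, then take a
+   * rounded average with the existing dst pixels. */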
+ tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+ tmp0 = __lsx_vxori_b(tmp0, 128);
+ dst0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+ tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+ dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+ tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+ tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+ tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+ dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp0, tmp1);
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
+ tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 3);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (height == 4) {
+ common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt = height >> 2;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *_src = (uint8_t *)src - 3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, tmp0,
+ tmp1, tmp2, tmp3);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst1, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height >> 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ dst0 = __lsx_vld(dst_tmp, 0);
+ dst1 = __lsx_vldx(dst_tmp, dst_stride);
+ dst_tmp += dst_stride2;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+ mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+ mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+ mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+ mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+ filter0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+ tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+ tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+ tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+ tmp7);
+ DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
+ DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
+ DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ dst += dst_stride2;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3, dst0, dst1;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
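+  /* shuff splices the high half of one 16-byte load with the low half of
+   * the next, recreating the unaligned vector that starts 8 bytes in. */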
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst0, dst1);
+ dst_tmp += dst_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+ mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+ mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+ mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+ mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+ filter0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+ tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+ tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+ tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+ tmp7);
+ DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3, dst0, dst1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
+ DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+ src3 = __lsx_vld(src, 56);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+ __lsx_vst(out0, dst, 32);
+ __lsx_vst(out1, dst, 48);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ uint8_t *dst_tmp = dst;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
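+  /* Each vector covers two rows: gather the pixel pairs, apply the bilinear
+   * taps, then round by FILTER_BITS and average with dst. */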
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1);
+ vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS);
+ vec0 = __lsx_vavgr_bu(vec0, dst0);
+ __lsx_vstelm_w(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 3);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i vec4, vec5, vec6, vec7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+ uint8_t *dst_tmp = dst;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src4 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2);
+ dst1 = __lsx_vilvl_d(dst2, dst1);
+
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+ res1, res2, res3);
+ DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2);
+ DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2);
+
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 3);
+ dst += dst_stride;
+}
+
+static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (height == 4) {
+ common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i filt0, dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ uint8_t *dst_tmp = dst;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec1);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1);
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec1, dst, 0, 1);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i filt0, dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ uint8_t *dst_tmp = dst;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec2);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 1);
+ dst += dst_stride;
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec2);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 1);
+ dst += dst_stride;
+
+ if (height == 16) {
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec2);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 1);
+ dst += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec2);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec2, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (height == 4) {
+ common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2) - 1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, dst0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp1 = (uint8_t *)src + 8;
+
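+  /* The first group of four rows is filtered before the loop, so loop_cnt
+   * is one short of height / 4. */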
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src_tmp1 += src_stride4;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+ res4, res5, res6, res7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+ FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0,
+ res2, res4, res6);
+ dst0 = __lsx_vld(dst, 0);
+ res0 = __lsx_vavgr_bu(res0, dst0);
+ __lsx_vst(res0, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res2 = __lsx_vavgr_bu(res2, dst0);
+ __lsx_vst(res2, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res4 = __lsx_vavgr_bu(res4, dst0);
+ __lsx_vst(res4, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res6 = __lsx_vavgr_bu(res6, dst0);
+ __lsx_vst(res6, dst, 0);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src_tmp1 += src_stride4;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, res4, res5, res6, res7);
+
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+ FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+ res0, res2, res4, res6);
+ dst0 = __lsx_vld(dst, 0);
+ res0 = __lsx_vavgr_bu(res0, dst0);
+ __lsx_vst(res0, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res2 = __lsx_vavgr_bu(res2, dst0);
+ __lsx_vst(res2, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res4 = __lsx_vavgr_bu(res4, dst0);
+ __lsx_vst(res4, dst, 0);
+ dst += dst_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ res6 = __lsx_vavgr_bu(res6, dst0);
+ __lsx_vst(res6, dst, 0);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, dst0, dst1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, res4, res5, res6, res7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+ FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+ res0, res2, res4, res6);
+
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ res0 = __lsx_vavgr_bu(res0, dst0);
+ __lsx_vst(res0, dst, 0);
+ res2 = __lsx_vavgr_bu(res2, dst1);
+ __lsx_vst(res2, dst, 16);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ res4 = __lsx_vavgr_bu(res4, dst0);
+ __lsx_vst(res4, dst, 0);
+ res6 = __lsx_vavgr_bu(res6, dst1);
+ __lsx_vst(res6, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+ src6);
+ src7 = __lsx_vld(src, 56);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out2, out4, out6);
+
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+ dst3);
+ out0 = __lsx_vavgr_bu(out0, dst0);
+ __lsx_vst(out0, dst, 0);
+ out2 = __lsx_vavgr_bu(out2, dst1);
+ __lsx_vst(out2, dst, 16);
+ out4 = __lsx_vavgr_bu(out4, dst2);
+ __lsx_vst(out4, dst, 32);
+ out6 = __lsx_vavgr_bu(out6, dst3);
+ __lsx_vst(out6, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
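+  /* 0x800000 is taps 2..3 == (0, 128), i.e. the unit (copy) filter, which
+   * this path does not handle. */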
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
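+  /* Bilinear kernels keep only taps 3 and 4, so the 2-tap helpers receive
+   * the coefficient pair at &filt_hor[3]. */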
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ switch (w) {
+ case 4:
+ common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
new file mode 100644
index 0000000000..d1abf622ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i out0, out1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
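+  /* Back up three columns and three rows so both 8-tap windows are centred
+   * on the first output pixel. */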
+ uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+ tmp2 = __lsx_vpackev_b(tmp5, tmp4);
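+  /* tmp0..tmp2 hold the packed history of horizontal results for rows 0..6;
+   * each iteration pushes four new rows through the vertical 8-tap filter. */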
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src5 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+ src2 = __lsx_vilvl_d(src3, src2);
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+ tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+ src0 = __lsx_vpackev_b(src1, src0);
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
+ out0 = __lsx_vxori_b(out0, 128);
+ out0 = __lsx_vavgr_bu(out0, src2);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ tmp5 = src1;
+ tmp0 = tmp2;
+ tmp1 = tmp4;
+ tmp2 = src0;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ __m128i out0, out1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ tmp0, tmp1, tmp2, tmp4);
+ DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp3 = __lsx_vpackev_b(src7, src6);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vpackev_b(src8, src7);
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = __lsx_vpackev_b(src9, src8);
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = __lsx_vpackev_b(src10, src9);
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+ FILTER_BITS, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ src5 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src7 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src8 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src9 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ src6 = src10;
+ tmp0 = tmp2;
+ tmp1 = tmp3;
+ tmp2 = src1;
+ tmp4 = tmp6;
+ tmp5 = src0;
+ tmp6 = src2;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+
+ common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+
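+  /* Filter the 32-wide block as four independent 8-wide strips. */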
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
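+  /* Five input rows yield the four vertical bilinear pairs needed for a
+   * 4x4 output block. */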
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+ hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+
+ dst0 = __lsx_vldrepl_w(dst, 0);
+ dst1 = __lsx_vldrepl_w(dst + dst_stride, 0);
+ dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0);
+ dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src5, src6, src7, src8);
+ src += src_stride4;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+ DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+ hz_out1, hz_out3);
+ hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+ hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst1 = __lsx_vilvl_w(dst2, dst1);
+ dst2 = __lsx_vilvl_w(dst4, dst3);
+ dst1 = __lsx_vilvl_d(dst2, dst1);
+
+ DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+ hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+ filt_vt, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, res0, res1);
+ DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
+
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (height == 4) {
+ common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (height == 8) {
+ common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
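+/* Same bilinear-plus-average scheme for 8-wide blocks.  The 8x8mult variant
+ * keeps the last horizontal result (hz_out0) live across loop iterations so
+ * only four new rows are filtered per pass. */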
+static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ uint8_t *dst_tmp = dst;
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ /* load and replicate the 2-tap filter coefficients */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ uint8_t *dst_tmp = dst;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* load and replicate the 2-tap filter coefficients */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (height == 4) {
+ common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else {
+ common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+ src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+ }
+}
+
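+/* 16 columns are handled as two 8-pixel halves per row (loads at offsets 0
+ * and 8).  The 32- and 64-wide wrappers below repeat this routine across
+ * 16-pixel stripes. */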
+static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint8_t *src_tmp1;
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride << 2;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ /* load and replicate the 2-tap filter coefficients */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src_tmp1 = (uint8_t *)(src + 8);
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst0);
+ __lsx_vst(tmp3, dst, 0);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst1);
+ __lsx_vstx(tmp3, dst, dst_stride);
+
+ hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst2);
+ __lsx_vstx(tmp3, dst, dst_stride2);
+
+ hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst3);
+ __lsx_vstx(tmp3, dst, dst_stride3);
+ dst += dst_stride4;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+
+ common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
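+/* Dispatch for the averaging convolution: both kernels 2-tap -> bilinear LSX
+ * paths above; exactly one kernel 2-tap -> generic C fallback; otherwise the
+ * 8-tap LSX paths.  Widths other than 4/8/16/32/64 always fall back to
+ * vpx_convolve8_avg_c().  The two non-zero bilinear taps sit at indices 3 and
+ * 4 of the 8-entry InterpKernel, hence &filt_hor[3] / &filt_ver[3]. */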
+void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
new file mode 100644
index 0000000000..5c6413df44
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
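+/* Vertical 8-tap filtering with destination averaging.  Filtering starts
+ * three rows above the current position (src - 3 * src_stride).  Pixels are
+ * XORed with 128 so the signed dot-product helpers can be used and XORed back
+ * before the rounded average with dst; four output rows are produced per loop
+ * iteration and the interleaved row registers are slid forward instead of
+ * being reloaded. */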
+static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i reg0, reg1, reg2, reg3, reg4;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ src0 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src4 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+ src6);
+ src_tmp0 += src_stride3;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+ reg2 = __lsx_vilvl_d(tmp5, tmp2);
+ DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+ reg2 = __lsx_vxori_b(reg2, 128);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
+ src0 = __lsx_vilvl_d(src1, src0);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+ DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+ filter2, filter3);
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+ filter2, filter3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+ out0 = __lsx_vxori_b(out0, 128);
+ out0 = __lsx_vavgr_bu(out0, src0);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+ reg0 = reg2;
+ reg1 = reg3;
+ reg2 = reg4;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1, out2, out3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ src0 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src4 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+ src6);
+ src_tmp0 += src_stride3;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+ filter2, filter3);
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+ filter2, filter3);
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ filter2, filter3);
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+ reg0 = reg2;
+ reg1 = tmp0;
+ reg2 = tmp2;
+ reg3 = reg5;
+ reg4 = tmp1;
+ reg5 = tmp3;
+ src6 = src10;
+ }
+}
+
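+/* Generic 8-tap vertical + average routine for widths that are multiples of
+ * 16: the block is processed in 16-pixel stripes (width >> 4), filtering the
+ * low and high byte interleaves of each stripe separately. */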
+static void common_vt_8t_and_aver_dst_16w_mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height, int32_t width) {
+ uint8_t *src_tmp;
+ uint32_t cnt = width >> 4;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; cnt--;) {
+ uint32_t loop_cnt = height >> 2;
+ uint8_t *dst_reg = dst;
+
+ src_tmp = src_tmp0;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src_tmp += src_stride3;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg6, reg7, reg8, reg9);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+ src7, src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8);
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ tmp2 = __lsx_vld(dst_reg, 0);
+ tmp3 = __lsx_vldx(dst_reg, dst_stride);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+ __lsx_vst(tmp0, dst_reg, 0);
+ __lsx_vstx(tmp1, dst_reg, dst_stride);
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ tmp2 = __lsx_vldx(dst_reg, dst_stride2);
+ tmp3 = __lsx_vldx(dst_reg, dst_stride3);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+ __lsx_vstx(tmp0, dst_reg, dst_stride2);
+ __lsx_vstx(tmp1, dst_reg, dst_stride3);
+ dst_reg += dst_stride4;
+
+ reg0 = reg2;
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;
+ }
+ src_tmp0 += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
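+/* Vertical bilinear (2-tap) filtering with destination averaging for 4-wide
+ * blocks; 4- and 8-row variants, selected by
+ * common_vt_2t_and_aver_dst_4w_lsx() below. */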
+static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4;
+ __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i tmp0, tmp1;
+ uint8_t *dst_tmp = dst;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+ src4332);
+ DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ out = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vstelm_w(out, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 3);
+ dst += dst_stride;
+}
+
+static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+ __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ __m128i src2110, src4332, src6554, src8776, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src7 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src8 = __lsx_vld(src, 0);
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst1 = __lsx_vilvl_w(dst2, dst1);
+ dst2 = __lsx_vilvl_w(dst4, dst3);
+ dst1 = __lsx_vilvl_d(dst2, dst1);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ src54_r, src65_r, src76_r, src87_r);
+ DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+ src87_r, src76_r, src2110, src4332, src6554, src8776);
+ DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
+ src8776, filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 3);
+}
+
+static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (height == 4) {
+ common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4;
+ __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1);
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 3);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src5 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+ src8 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst5 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (height == 4) {
+ common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ uint8_t *src_tmp1;
+ uint8_t *dst_tmp1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp0, tmp1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+
+ src_tmp1 = src + 16;
+ src6 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7,
+ src8);
+ src9 = __lsx_vldx(src_tmp1, src_stride3);
+
+ dst_tmp1 = dst + 16;
+ dst4 = __lsx_vld(dst_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5,
+ dst6);
+ dst7 = __lsx_vldx(dst_tmp1, dst_stride3);
+ src += src_stride4;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vst(tmp0, dst, 0);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+ __lsx_vstx(tmp0, dst, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+ __lsx_vstx(tmp0, dst, dst_stride2);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+ __lsx_vstx(tmp0, dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+ __lsx_vst(tmp0, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+ dst += dst_stride;
+ __lsx_vst(tmp0, dst, 16);
+
+ DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+ dst += dst_stride;
+ __lsx_vst(tmp0, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+ dst += dst_stride;
+ __lsx_vst(tmp0, dst, 16);
+ dst += dst_stride;
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ int32_t src_stride2 = src_stride << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ uint8_t *src_tmp1;
+ uint8_t *dst_tmp1;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11, filt0;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+ src9);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src2 = __lsx_vldx(src, src_stride);
+ dst1 = __lsx_vldx(dst, dst_stride);
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+ src10);
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4,
+ dst6);
+ src_tmp1 = (uint8_t *)src + 16;
+ src5 = __lsx_vldx(src_tmp1, src_stride);
+ src_tmp1 = src_tmp1 + 16;
+ src8 = __lsx_vldx(src_tmp1, src_stride);
+ src_tmp1 = src_tmp1 + 16;
+ src11 = __lsx_vldx(src_tmp1, src_stride);
+
+ dst_tmp1 = dst + 16;
+ dst3 = __lsx_vldx(dst_tmp1, dst_stride);
+ dst_tmp1 = dst + 32;
+ dst5 = __lsx_vldx(dst_tmp1, dst_stride);
+ dst_tmp1 = dst + 48;
+ dst7 = __lsx_vldx(dst_tmp1, dst_stride);
+ src += src_stride2;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vst(tmp0, dst, 0);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+ __lsx_vstx(tmp0, dst, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+ __lsx_vst(tmp0, dst, 16);
+
+ dst_tmp1 = dst + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+ __lsx_vst(tmp0, dst, 32);
+
+ dst_tmp1 = dst_tmp1 + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+ __lsx_vst(tmp0, dst, 48);
+
+ dst_tmp1 = dst_tmp1 + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+ dst += dst_stride2;
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
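+/* Vertical-only averaging convolution: 2-tap kernels take the bilinear paths,
+ * everything else takes the 8-tap paths; unsupported widths fall back to
+ * vpx_convolve8_avg_vert_c(). */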
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+
+ break;
+ case 32:
+ common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
new file mode 100644
index 0000000000..2c6459a978
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
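+/* Byte-shuffle masks for the horizontal filters.  Offset 0 gathers the
+ * overlapping pixel pairs for the 8-wide cases; offsets 16 and 32 pack two
+ * 4-wide rows into one vector (indices >= 16 select bytes from the other
+ * source row passed to vshuf.b). */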
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
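+/* Horizontal 8-tap filters.  src is moved back 3 pixels so the 8-tap window
+ * is centred on each output pixel; inputs are XORed with 128 for the signed
+ * filtering macros and the results are XORed back before storing. */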
+static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out, out0, out1;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1);
+ out = __lsx_vssrarni_b_h(out1, out0, 7);
+ out = __lsx_vxori_b(out, 128);
+ __lsx_vstelm_w(out, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 3);
+}
+
+static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+ uint8_t *_src = (uint8_t *)src - 3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1);
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1,
+ out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+}
+
+static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+ uint8_t *_src = (uint8_t *)src - 3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 1;
+ int32_t stride = src_stride << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ const uint8_t *_src = src + src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
+ DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ src += stride;
+ }
+}
+
+static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+
+ dst += dst_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ dst += dst_stride;
+ }
+}
+
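+/* Horizontal 8-tap filter for 64-wide blocks: one row per iteration, handled
+ * as two independent 32-byte halves. */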
+static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
+ DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+ src3 = __lsx_vld(src, 56);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 32);
+ __lsx_vst(out1, dst, 48);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
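+/* Bilinear (2-tap) horizontal filter for a 4x4 block: the 4-width mask packs
+ * pixel pairs from two rows into one vector, so each dot product yields two
+ * rows of output, rounded with a FILTER_BITS shift. */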
+static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+ FILTER_BITS, res0, res1);
+
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i res0, res1, res2, res3, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  const uint8_t *src_tmp1 = src + src_stride4;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+ src7, src6, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+ res1, res2, res3);
+
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 1);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res2, dst, 0, 0);
+ __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i filt0, mask;
+ __m128i src0, src1, src2, src3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec1);
+
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ __m128i filt0, mask;
+ __m128i src0, src1, src2, src3, out0, out1;
+ __m128i vec0, vec1, vec2, vec3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ if (height == 16) {
+ uint8_t *dst_tmp1 = dst + dst_stride4;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
+ }
+}
+
+static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
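+/* Bilinear horizontal filter for 16-wide blocks: four rows per iteration, each
+ * row processed as two 8-pixel halves; the first four rows are filtered before
+ * the loop, hence loop_cnt = (height >> 2) - 1. */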
+static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2) - 1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+  const uint8_t *src_tmp1 = src + 8;
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
+ src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+ out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+ out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ src_tmp1 += src_stride4;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
+ src7 = __lsx_vld(src, 24);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ dst += dst_stride;
+
+ __lsx_vst(out2, dst, 0);
+ __lsx_vst(out3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+ src6);
+ src7 = __lsx_vld(src, 56);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ __lsx_vst(out2, dst, 32);
+ __lsx_vst(out3, dst, 48);
+ dst += dst_stride;
+ }
+}
+
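+/* Horizontal-only convolve entry point: copies the selected x-phase kernel to
+ * int8, then dispatches on tap count and width. 2-tap kernels take the
+ * bilinear paths (the two center taps are passed via &filt_hor[3]);
+ * unsupported widths fall back to the C implementation. */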
+void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+
+ case 16:
+ common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+
+ case 32:
+ common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+
+ case 64:
+ common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
new file mode 100644
index 0000000000..9f5cd6cfe9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
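+/* 2-D 8-tap filter (horizontal then vertical) for 4-wide blocks: seven context
+ * rows are horizontally filtered up front, then four output rows are produced
+ * per iteration by the vertical dot-product accumulate. */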
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i out0, out1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ src -= (3 + 3 * src_stride);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+ src5 = __lsx_vld(src, 0);
+ src += src_stride;
+ src6 = __lsx_vld(src, 0);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+ tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+ for (; loop_cnt--;) {
+ LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+ tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+ src0 = __lsx_vpackev_b(src1, src0);
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+ out0 = __lsx_vxori_b(out0, 128);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ tmp5 = src1;
+ tmp0 = tmp2;
+ tmp1 = tmp4;
+ tmp2 = src0;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ __m128i out0, out1;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= (3 + 3 * src_stride);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+ src5 = __lsx_vld(src, 0);
+ src += src_stride;
+ src6 = __lsx_vld(src, 0);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ tmp0, tmp1, tmp2, tmp4);
+ DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+ for (; loop_cnt--;) {
+ LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp3 = __lsx_vpackev_b(src7, src6);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vpackev_b(src8, src7);
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = __lsx_vpackev_b(src9, src8);
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = __lsx_vpackev_b(src10, src9);
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ src6 = src10;
+ tmp0 = tmp2;
+ tmp1 = tmp3;
+ tmp2 = src1;
+ tmp4 = tmp6;
+ tmp5 = src0;
+ tmp6 = src2;
+ }
+}
+
+static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+
+ common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
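+/* Bilinear horizontal + bilinear vertical filter for a 4x4 block: five rows
+ * are filtered horizontally, then adjacent intermediate rows are paired for
+ * the vertical pass and rounded with a FILTER_BITS shift. */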
+static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_vt, filt_hz, vec0, vec1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+
+ hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+ hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+ FILTER_BITS, tmp0, tmp1);
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src5, src6, src7, src8);
+ src += src_stride4;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+
+ DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+ hz_out1, hz_out3);
+ hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+ hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+ DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+ hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+ filt_vt, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+ vec5, vec6, vec7);
+
+ __lsx_vstelm_w(vec4, dst, 0, 0);
+ __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
+ dst += dst_stride4;
+ __lsx_vstelm_w(vec6, dst, 0, 0);
+ __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (height == 4) {
+ common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else if (height == 8) {
+ common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = (height >> 3);
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0;
+ __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2);
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2);
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (height == 4) {
+ common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1;
+ __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+ for (; loop_cnt--;) {
+    const uint8_t *src_tmp0 = src + 8;
+
+ DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+ src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+ DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+ src += src_stride4;
+
+ hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+ tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+
+ common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
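+/* Full 2-D convolve entry point: both kernels 2-tap -> bilinear hv paths, both
+ * 8-tap -> 8-tap hv paths; a mix of 2-tap and 8-tap kernels (and unsupported
+ * widths) falls back to the C implementation. */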
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4,
+ int32_t y_step_q4, int32_t w, int32_t h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
new file mode 100644
index 0000000000..6022e43c83
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
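+/* Vertical 8-tap filter for 4-wide blocks: the source is rewound three rows,
+ * seven rows of context are interleaved once, and each iteration filters four
+ * new rows with the 8-tap dot-product-accumulate helper. */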
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i reg0, reg1, reg2, reg3, reg4;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1;
+ uint8_t *_src = (uint8_t *)src - src_stride3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+ reg2 = __lsx_vilvl_d(tmp5, tmp2);
+ DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+ reg2 = __lsx_vxori_b(reg2, 128);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+ DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+ filter2, filter3);
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+ filter2, filter3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+ out0 = __lsx_vxori_b(out0, 128);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ reg0 = reg2;
+ reg1 = reg3;
+ reg2 = reg4;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1, out2, out3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ src = src - src_stride3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+ filter2, filter3);
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+ filter2, filter3);
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ filter2, filter3);
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ reg0 = reg2;
+ reg1 = tmp0;
+ reg2 = tmp2;
+ reg3 = reg5;
+ reg4 = tmp1;
+ reg5 = tmp3;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ // uint8_t *_src = (uint8_t *)src - src_stride3;
+ src -= src_stride3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6,
+ reg7, reg8, reg9);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8);
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(tmp1, dst, 0);
+ dst += dst_stride;
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(tmp1, dst, 0);
+ dst += dst_stride;
+
+ reg0 = reg2;
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;
+ }
+}
+
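+/* Vertical 8-tap filter for widths that are multiples of 16: the outer loop
+ * walks 16-column tiles and the inner loop produces four rows per iteration;
+ * used by the 32- and 64-wide wrappers below. */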
+static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+  const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t cnt = width >> 4;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+ src -= src_stride3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; cnt--;) {
+ uint32_t loop_cnt = height >> 2;
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src_tmp += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg6, reg7, reg8, reg9);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+ src7, src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8);
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vst(tmp0, dst_tmp, 0);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride);
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vstx(tmp0, dst_tmp, dst_stride2);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride3);
+ dst_tmp += dst_stride4;
+
+ reg0 = reg2;
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;
+ }
+ src += 16;
+ dst += 16;
+ }
+}
+
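+// The 32- and 64-pixel-wide vertical 8-tap filters reuse the 16-wide
+// column loop above.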
+static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+ 32);
+}
+
+static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+ 64);
+}
+
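+// Vertical 2-tap (bilinear) filter for a 4x4 block. The four rows are
+// packed into two vectors so one rounding shift by FILTER_BITS yields all
+// 16 output pixels.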
+static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i filt0, tmp0, tmp1;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += (src_stride4 + src_stride);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);
+ DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i vec6, vec7, vec8, vec9, vec10, vec11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i filt0;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+ uint8_t *dst_tmp1 = dst + dst_stride4;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src5, src6, src7, src8);
+ src += (src_stride4 + src_stride);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+ vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+ vec9, vec10, vec11);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1);
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+ __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
+}
+
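+// Dispatcher for 4-wide bilinear filtering; only heights 4 and 8 are
+// implemented (other heights are presumably never passed on this path).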
+static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else if (height == 8) {
+ common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
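+// Vertical 2-tap filter for an 8x4 block: __lsx_vilvl_b pairs each pixel
+// with the one in the row below so __lsx_vdp2_h_bu computes
+// p0 * f0 + p1 * f1 per pixel.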
+static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+}
+
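+// Vertical 2-tap filter for 8-wide blocks with height a multiple of 8,
+// producing 8 output rows per loop iteration.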
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 3);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src5 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+ src8 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+ dst += dst_stride4;
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+ dst += dst_stride4;
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (height == 4) {
+ common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
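+// Vertical 2-tap filter for 16-wide blocks: the low and high byte
+// interleaves of consecutive rows are filtered separately and repacked
+// into one 16-byte store per output row.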
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
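+// Vertical 2-tap filter for 32-wide blocks, handled as two 16-wide halves
+// (src and src + 16) four rows at a time.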
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp, tmp0, tmp1;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ const uint8_t *src_tmp;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+ src += src_stride;
+ src_tmp = src + 16;
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+ DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ src += src_stride4;
+ src_tmp += src_stride4;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride2);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ dst += dst_stride;
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
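+// Vertical 2-tap filter for 64-wide blocks: four 16-wide stripes, two
+// output rows per iteration.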
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp, tmp0, tmp1;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ uint8_t *dst_tmp1 = dst + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+ src9);
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ const uint8_t *src_tmp0 = src + src_stride;
+
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+ src10);
+ DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+ src2, src5, src8, src11);
+ src += src_stride2;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst_tmp1, 0);
+
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst_tmp1, 16);
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 32);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst_tmp1, 32);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 48);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst_tmp1, 48);
+ dst += dst_stride2;
+ dst_tmp1 += dst_stride2;
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
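+// Top-level vertical convolution: selects the 2-tap or 8-tap path from the
+// kernel's tap count, dispatches on width, and falls back to the C version
+// for unsupported widths.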
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
new file mode 100644
index 0000000000..1dad29eeed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
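+// Rounding average of src into dst: __lsx_vavgr_bu computes
+// (a + b + 1) >> 1 per byte, matching the scalar fallback in
+// vpx_convolve_avg_lsx(). Only even heights are handled here.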
+static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ __m128i src0, src1;
+ __m128i dst0, dst1;
+
+ int32_t src_stride2 = src_stride << 1;
+
+ if ((height % 2) == 0) {
+ for (cnt = (height / 2); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ src1 = __lsx_vldx(src, src_stride);
+ src += src_stride2;
+
+ dst0 = __lsx_vld(dst, 0);
+ dst1 = __lsx_vldx(dst, dst_stride);
+ DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 0);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt = (height / 4);
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ for (; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst3, dst, 0, 0);
+ dst += dst_stride;
+ }
+}
+
+static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt = (height / 8);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ for (; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src7 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+ dst4 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6);
+ dst7 = __lsx_vldx(dst, dst_stride3);
+ dst -= dst_stride4;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride2);
+ __lsx_vstx(dst3, dst, dst_stride3);
+ dst += dst_stride4;
+ __lsx_vst(dst4, dst, 0);
+ __lsx_vstx(dst5, dst, dst_stride);
+ __lsx_vstx(dst6, dst, dst_stride2);
+ __lsx_vstx(dst7, dst, dst_stride3);
+ dst += dst_stride4;
+ }
+}
+
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt = (height / 8);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ for (; cnt--;) {
+ uint8_t *dst_tmp = dst;
+ uint8_t *dst_tmp1 = dst_tmp + 16;
+ const uint8_t *src_tmp = src + 16;
+
+ DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+ DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7);
+ src += src_stride4;
+
+ DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1);
+ DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+ dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6,
+ dst7);
+ dst_tmp += dst_stride4;
+ dst_tmp1 += dst_stride4;
+
+ src_tmp = src + 16;
+ DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src10, src11, src12, src13);
+ DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15);
+ src += src_stride4;
+
+ DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9);
+ DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+ dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14,
+ dst15);
+ DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+ DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+ dst11, dst8, dst9, dst10, dst11);
+ DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+ dst15, dst12, dst13, dst14, dst15);
+
+ dst_tmp = dst + 16;
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst2, dst, dst_stride);
+ __lsx_vstx(dst4, dst, dst_stride2);
+ __lsx_vstx(dst6, dst, dst_stride3);
+ __lsx_vst(dst1, dst_tmp, 0);
+ __lsx_vstx(dst3, dst_tmp, dst_stride);
+ __lsx_vstx(dst5, dst_tmp, dst_stride2);
+ __lsx_vstx(dst7, dst_tmp, dst_stride3);
+ dst += dst_stride4;
+
+ __lsx_vst(dst8, dst, 0);
+ __lsx_vstx(dst10, dst, dst_stride);
+ __lsx_vstx(dst12, dst, dst_stride2);
+ __lsx_vstx(dst14, dst, dst_stride3);
+ __lsx_vst(dst9, dst_tmp1, 0);
+ __lsx_vstx(dst11, dst_tmp1, dst_stride);
+ __lsx_vstx(dst13, dst_tmp1, dst_stride2);
+ __lsx_vstx(dst15, dst_tmp1, dst_stride3);
+ dst += dst_stride4;
+ }
+}
+
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt = (height / 4);
+ uint8_t *dst_tmp = dst;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (; cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+ src7);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10,
+ src11);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14,
+ src15);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+ dst0, dst1, dst2, dst3);
+ dst_tmp += dst_stride;
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+ dst4, dst5, dst6, dst7);
+ dst_tmp += dst_stride;
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+ dst8, dst9, dst10, dst11);
+ dst_tmp += dst_stride;
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+ dst12, dst13, dst14, dst15);
+ dst_tmp += dst_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+ DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+ dst11, dst8, dst9, dst10, dst11);
+ DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+ dst15, dst12, dst13, dst14, dst15);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst4, dst, 0);
+ __lsx_vst(dst5, dst, 16);
+ __lsx_vst(dst6, dst, 32);
+ __lsx_vst(dst7, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst8, dst, 0);
+ __lsx_vst(dst9, dst, 16);
+ __lsx_vst(dst10, dst, 32);
+ __lsx_vst(dst11, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst12, dst, 0);
+ __lsx_vst(dst13, dst, 16);
+ __lsx_vst(dst14, dst, 32);
+ __lsx_vst(dst15, dst, 48);
+ dst += dst_stride;
+ }
+}
+
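+// vpx_convolve_avg: average the source block into dst. The filter
+// arguments are unused; widths other than 4/8/16/32/64 take the scalar
+// rounding-average loop.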
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ switch (w) {
+ case 4: {
+ avg_width4_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int32_t lp, cnt;
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000000..53dc7097ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
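+// Unfiltered block copy helpers: straight vector load/store, with row
+// counts unrolled by 12, 8, 4 or 2 depending on height.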
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ if ((height % 12) == 0) {
+ for (cnt = (height / 12); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride2;
+ src7 = __lsx_vldx(src, src_stride);
+ src += src_stride2;
+
+ __lsx_vstelm_d(src0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src3, dst, 0, 0);
+ dst += dst_stride;
+
+ __lsx_vstelm_d(src4, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src5, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src6, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src7, dst, 0, 0);
+ dst += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ __lsx_vstelm_d(src0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src3, dst, 0, 0);
+ dst += dst_stride;
+ }
+ } else if ((height % 8) == 0) {
+ for (cnt = height >> 3; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride2;
+ src7 = __lsx_vldx(src, src_stride);
+ src += src_stride2;
+
+ __lsx_vstelm_d(src0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src3, dst, 0, 0);
+ dst += dst_stride;
+
+ __lsx_vstelm_d(src4, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src5, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src6, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src7, dst, 0, 0);
+ dst += dst_stride;
+ }
+ } else if ((height % 4) == 0) {
+ for (cnt = (height / 4); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ __lsx_vstelm_d(src0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src3, dst, 0, 0);
+ dst += dst_stride;
+ }
+ } else if ((height % 2) == 0) {
+ for (cnt = (height / 2); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ src1 = __lsx_vldx(src, src_stride);
+ src += src_stride2;
+
+ __lsx_vstelm_d(src0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(src1, dst, 0, 0);
+ dst += dst_stride;
+ }
+ }
+}
+
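+// Generic copy for widths that are multiples of 16 and heights that are
+// multiples of 8, walking the block in 16-byte columns.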
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = (uint8_t *)src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp,
+ src_stride3, src_tmp, src_stride4, src1, src2, src3, src4);
+ src_tmp += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src_tmp += src_stride2;
+ src7 = __lsx_vldx(src_tmp, src_stride);
+ src_tmp += src_stride2;
+
+ __lsx_vst(src0, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src1, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src2, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src3, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src4, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src5, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src6, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src7, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ }
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ if ((height % 12) == 0) {
+ for (cnt = (height / 12); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride2;
+ src7 = __lsx_vldx(src, src_stride);
+ src += src_stride2;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src4, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src5, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src6, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src7, dst, 0);
+ dst += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+ }
+ } else if ((height % 8) == 0) {
+ copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+ } else if ((height % 4) == 0) {
+ for (cnt = (height >> 2); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ if ((height % 12) == 0) {
+ for (cnt = (height / 12); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src_tmp = (uint8_t *)src + 16;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+
+ dst_tmp = dst + 16;
+ __lsx_vst(src4, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src5, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src6, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src7, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src_tmp = (uint8_t *)src + 16;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+
+ dst_tmp = dst + 16;
+ __lsx_vst(src4, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src5, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src6, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src7, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src_tmp = (uint8_t *)src + 16;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+
+ dst_tmp = dst + 16;
+ __lsx_vst(src4, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src5, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src6, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src7, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ }
+ } else if ((height % 8) == 0) {
+ copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+ } else if ((height % 4) == 0) {
+ for (cnt = (height >> 2); cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src_tmp = (uint8_t *)src + 16;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ __lsx_vst(src0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(src3, dst, 0);
+ dst += dst_stride;
+
+ dst_tmp = dst + 16;
+ __lsx_vst(src4, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src5, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src6, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ __lsx_vst(src7, dst_tmp, 0);
+ dst_tmp += dst_stride;
+ }
+ }
+}
+
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt;
+ __m128i tmp;
+ for (cnt = h; cnt--;) {
+ tmp = __lsx_vldrepl_w(src, 0);
+ __lsx_vstelm_w(tmp, dst, 0, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000000..d886b00198
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
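+// Sum an 8-tap filter as four 2-tap dot products: each _reg holds
+// interleaved sample pairs, and the two partial sums are combined with a
+// saturating add.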
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+ __m128i _reg2, __m128i _reg3,
+ __m128i _filter0, __m128i _filter1,
+ __m128i _filter2, __m128i _filter3) {
+ __m128i _vec0, _vec1;
+
+ _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+ _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+ _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+ _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+ return __lsx_vsadd_h(_vec0, _vec1);
+}
+
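+// 8-tap horizontal filter of one row: the mask registers gather shifted
+// sample pairs via __lsx_vshuf_b, and the result is rounded by FILTER_BITS
+// and saturated to the signed 8-bit range.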
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+ __m128i _mask0, __m128i _mask1,
+ __m128i _mask2, __m128i _mask3,
+ __m128i _filt_h0, __m128i _filt_h1,
+ __m128i _filt_h2, __m128i _filt_h3) {
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+ __m128i _out;
+
+ DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+ _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+ _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+ _filt_h2, _filt_h3);
+ _out = __lsx_vsrari_h(_out, FILTER_BITS);
+ return __lsx_vsat_h(_out, 7);
+}
+
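+// 2-tap horizontal filter; returns unsigned halfword results rounded by
+// FILTER_BITS.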
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+ __m128i coeff) {
+ __m128i tmp0_m, tmp1_m;
+
+ tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+ tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+ return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+ do { \
+ _src0 = __lsx_vld(_src, 0); \
+ _src += _stride; \
+ _src1 = __lsx_vld(_src, 0); \
+ _src += _stride; \
+ _src2 = __lsx_vld(_src, 0); \
+ _src += _stride; \
+ _src3 = __lsx_vld(_src, 0); \
+ } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
+ _mask2, _mask3, _filter0, _filter1, \
+ _filter2, _filter3, _out0, _out1) \
+ do { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _reg0, _reg1, _reg2, _reg3; \
+ \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \
+ _tmp0, _tmp1); \
+ DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \
+ _tmp2, _tmp3); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \
+ _filter1, _reg0, _reg1); \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \
+ _tmp4, _tmp5); \
+ DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \
+ _tmp6, _tmp7); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \
+ _filter3, _reg2, _reg3); \
+ DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \
+ } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT( \
+ _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \
+ _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \
+ do { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \
+ \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \
+ _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \
+ _tmp3); \
+ DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \
+ _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \
+ _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \
+ _tmp3); \
+ DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \
+ _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \
+ _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \
+ _tmp7); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \
+ _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
+ _reg1, _reg2, _reg3); \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \
+ _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \
+ _tmp7); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \
+ _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
+ _reg5, _reg6, _reg7); \
+ DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
+ _reg7, _out0, _out1, _out2, _out3); \
+ } while (0)
+
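+// Average in0/in1 against dst0/dst1 with rounding and store the result as
+// four 8-byte rows at pdst.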
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \
+ do { \
+ __m128i tmp0_m, tmp1_m; \
+ \
+ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loopfilter.c b/media/libvpx/libvpx/vpx_dsp/loopfilter.c
new file mode 100644
index 0000000000..d6504aab1f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loopfilter.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+ return (int8_t)clamp(t, -128, 127);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+ switch (bd) {
+ case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+ case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+ case 8:
+ default: return (int16_t)clamp(t, -128, 128 - 1);
+ }
+}
+#endif
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
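+// Is the segment flat enough for the wider filter: p1-p3 within thresh of
+// p0 and q1-q3 within thresh of q0 => 11111111, else 00000000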
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ mask |= (abs(p3 - p0) > thresh) * -1;
+ mask |= (abs(q3 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3,
+ uint8_t q4) {
+ int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
+ mask |= (abs(p4 - p0) > thresh) * -1;
+ mask |= (abs(q4 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at the internal edge: 11111111 yes,
+// 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1) {
+ int8_t hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+ int8_t filter1, filter2;
+
+ const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+ const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+ const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+ const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+ const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+ // add outer taps if we have high edge variance
+ int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+ // inner taps
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+ // save bottom 3 bits so that we round one side +4 and the other +3
+ // if it equals 4 we'll set it to adjust by -1 to account for the fact
+ // we'd round it by 3 the other way
+ filter1 = signed_char_clamp(filter + 4) >> 3;
+ filter2 = signed_char_clamp(filter + 3) >> 3;
+
+ *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
+ *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
+
+ // outer tap adjustments
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
+ *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
+}
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);
+ ++s;
+ }
+}
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+ s += pitch;
+ }
+}
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3) {
+ if (flat && mask) {
+ const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
+
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+ s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);
+ ++s;
+ }
+}
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ s + 3);
+ s += pitch;
+ }
+}
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op7, uint8_t *op6,
+ uint8_t *op5, uint8_t *op4, uint8_t *op3,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
+ uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
+ uint8_t *oq6, uint8_t *oq7) {
+ if (flat2 && flat && mask) {
+ const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
+ p2 = *op2, p1 = *op1, p0 = *op0;
+
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5, q6 = *oq6, q7 = *oq7;
+
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+
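+// Apply the widest loop filter across count groups of 8 pixel columns,
+// choosing per column between the 15-tap, 7-tap and 4-tap filters based
+// on the flatness masks.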
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ int i;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask5(
+ 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+ s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
+
+ filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+ s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,
+ s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
+ s + 7 * pitch);
+ ++s;
+ }
+}
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4],
+ s[5], s[6], s[7]);
+
+ filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
+ s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
+ s + 7);
+ s += pitch;
+ }
+}
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);
+}
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+ uint16_t p3, uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p3 - p2) > limit16) * -1;
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(q3 - q2) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2, uint16_t q3,
+ int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ mask |= (abs(p3 - p0) > thresh16) * -1;
+ mask |= (abs(q3 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
+ uint16_t p2, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, uint16_t q2,
+ uint16_t q3, uint16_t q4, int bd) {
+ int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p4 - p0) > thresh16) * -1;
+ mask |= (abs(q4 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, int bd) {
+ int16_t hev = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ hev |= (abs(p1 - p0) > thresh16) * -1;
+ hev |= (abs(q1 - q0) > thresh16) * -1;
+ return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ int bd) {
+ int16_t filter1, filter2;
+ // Subtracting (0x80 << shift) centers the bd-bit values around zero,
+ // the high bit depth analogue of XORing 8-bit values with 0x80.
+ int shift = bd - 8;
+ const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+ const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+ const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+ const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+ const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+ // Add outer taps if we have high edge variance.
+ int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+ // Inner taps.
+ filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+ // Save the bottom 3 bits so that we round one side +4 and the other +3.
+ // If it equals 4 we adjust by -1 to account for the fact that we would
+ // have rounded it by 3 the other way.
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+ filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+ *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+ *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+ // Outer tap adjustments.
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+ *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ int i;
+
+ // High bit depth loop filter operating on uint16_t samples; the logic
+ // mirrors the 8-bit version above.
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4 * pitch];
+ const uint16_t p2 = s[-3 * pitch];
+ const uint16_t p1 = s[-2 * pitch];
+ const uint16_t p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch];
+ const uint16_t q1 = s[1 * pitch];
+ const uint16_t q2 = s[2 * pitch];
+ const uint16_t q3 = s[3 * pitch];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,
+ s + 1 * pitch, bd);
+ ++s;
+ }
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+
+ // High bit depth loop filter operating on uint16_t samples; the logic
+ // mirrors the 8-bit version above.
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+ s += pitch;
+ }
+}
+
+void vpx_highbd_lpf_vertical_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint16_t *op3, uint16_t *op2, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3, int bd) {
+ if (flat && mask) {
+ const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ int i;
+
+ // High bit depth loop filter operating on uint16_t samples; the logic
+ // mirrors the 8-bit version above.
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
+
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+ s + 2 * pitch, s + 3 * pitch, bd);
+ ++s;
+ }
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
+ s + 2, s + 3, bd);
+ s += pitch;
+ }
+}
+
+void vpx_highbd_lpf_vertical_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op7, uint16_t *op6,
+ uint16_t *op5, uint16_t *op4, uint16_t *op3,
+ uint16_t *op2, uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+ uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
+ uint16_t *oq6, uint16_t *oq7, int bd) {
+ if (flat2 && flat && mask) {
+ const uint16_t p7 = *op7;
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+ const uint16_t q7 = *oq7;
+
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+ *op6 = ROUND_POWER_OF_TWO(
+ p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(
+ p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
+ q1 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
+ q2 + q3 + q4 + q5 + q6 + q7,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
+ q3 + q4 + q5 + q6 + q7 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(
+ p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ } else {
+ highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
+
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+ // High bit depth loop filter operating on uint16_t samples; the logic
+ // mirrors the 8-bit version above.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4 * pitch];
+ const uint16_t p2 = s[-3 * pitch];
+ const uint16_t p1 = s[-2 * pitch];
+ const uint16_t p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch];
+ const uint16_t q1 = s[1 * pitch];
+ const uint16_t q2 = s[2 * pitch];
+ const uint16_t q3 = s[3 * pitch];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 = highbd_flat_mask5(
+ 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+ s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+ s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+ s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,
+ s + 6 * pitch, s + 7 * pitch, bd);
+ ++s;
+ }
+}
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4];
+ const uint16_t p2 = s[-3];
+ const uint16_t p1 = s[-2];
+ const uint16_t p0 = s[-1];
+ const uint16_t q0 = s[0];
+ const uint16_t q1 = s[1];
+ const uint16_t q2 = s[2];
+ const uint16_t q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
+ s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
+ s + 5, s + 6, s + 7, bd);
+ s += pitch;
+ }
+}
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
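A minimal sketch, assuming the usual ROUND_POWER_OF_TWO definition from vpx_dsp/vpx_dsp_common.h, of why the smoothing kernels above pair with shifts of 3 and 4: the 7-tap weights [1, 1, 1, 2, 1, 1, 1] sum to 8 and the 15-tap weights sum to 16, so a flat neighbourhood is returned unchanged.

    #include <assert.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    int main(void) {
      int v;
      for (v = 0; v < 256; ++v) {
        assert(ROUND_POWER_OF_TWO(v * 8, 3) == v);  /* 7-tap weights sum to 8 */
        assert(ROUND_POWER_OF_TWO(v * 16, 4) == v); /* 15-tap weights sum to 16 */
      }
      return 0;
    }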
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c
new file mode 100644
index 0000000000..97541411e4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
+ int blackclamp, int whiteclamp, int width,
+ int height, int32_t pitch) {
+ int i, j;
+ v16u8 pos0, pos1, ref0, ref1;
+ v16i8 black_clamp, white_clamp, both_clamp;
+
+ black_clamp = __msa_fill_b(blackclamp);
+ white_clamp = __msa_fill_b(whiteclamp);
+ both_clamp = black_clamp + white_clamp;
+ both_clamp = -both_clamp;
+
+ for (i = 0; i < height / 2; ++i) {
+ uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+ const int8_t *ref0_ptr = noise + (rand() & 0xff);
+ uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+ const int8_t *ref1_ptr = noise + (rand() & 0xff);
+ for (j = width / 16; j--;) {
+ pos0 = LD_UB(pos0_ptr);
+ ref0 = LD_UB(ref0_ptr);
+ pos1 = LD_UB(pos1_ptr);
+ ref1 = LD_UB(ref1_ptr);
+ pos0 = __msa_subsus_u_b(pos0, black_clamp);
+ pos1 = __msa_subsus_u_b(pos1, black_clamp);
+ pos0 = __msa_subsus_u_b(pos0, both_clamp);
+ pos1 = __msa_subsus_u_b(pos1, both_clamp);
+ pos0 = __msa_subsus_u_b(pos0, white_clamp);
+ pos1 = __msa_subsus_u_b(pos1, white_clamp);
+ pos0 += ref0;
+ ST_UB(pos0, pos0_ptr);
+ pos1 += ref1;
+ ST_UB(pos1, pos1_ptr);
+ pos0_ptr += 16;
+ pos1_ptr += 16;
+ ref0_ptr += 16;
+ ref1_ptr += 16;
+ }
+ }
+}
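A scalar sketch of what the MSA kernel above computes per pixel, an assumption rather than the libvpx C reference in vpx_dsp/add_noise.c: the three saturating subtracts clamp the sample into [blackclamp, 255 - whiteclamp] before a byte of dither noise is added.

    #include <stdint.h>
    #include <stdlib.h>

    static void plane_add_noise_scalar(uint8_t *start, const int8_t *noise,
                                       int blackclamp, int whiteclamp, int width,
                                       int height, int pitch) {
      int i, j;
      for (i = 0; i < height; ++i) {
        uint8_t *pos = start + i * pitch;
        const int8_t *ref = noise + (rand() & 0xff); /* random 256-byte window */
        for (j = 0; j < width; ++j) {
          int v = pos[j];
          if (v < blackclamp) v = blackclamp;
          if (v > 255 - whiteclamp) v = 255 - whiteclamp;
          pos[j] = (uint8_t)(v + ref[j]); /* clamp, then add dither */
        }
      }
    }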
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c
new file mode 100644
index 0000000000..3fd18dec56
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c
@@ -0,0 +1,731 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+ v4u32 sum = { 0 };
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
+ HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
+ ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
+ ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
+ sum0 += sum4;
+
+ sum = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
+ sum = __msa_hadd_u_w(sum0, sum0);
+ sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
+ sum_out = __msa_copy_u_w((v4i32)sum, 0);
+
+ return sum_out;
+}
+
+uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+ uint32_t sum_out;
+ uint32_t src0, src1, src2, src3;
+ v16u8 vec = { 0 };
+ v8u16 sum0;
+ v4u32 sum1;
+ v2u64 sum2;
+
+ LW4(src, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, vec);
+
+ sum0 = __msa_hadd_u_h(vec, vec);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
+ sum1 = __msa_hadd_u_w(sum0, sum0);
+ sum2 = __msa_hadd_u_d(sum1, sum1);
+ sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
+ sum_out = __msa_copy_u_w((v4i32)sum1, 0);
+
+ return sum_out;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
+ int16_t *dst) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
+}
+
+void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
+ int16_t *dst) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+
+ LD_SH2(src, 8, src0, src8);
+ src += src_stride;
+ LD_SH2(src, 8, src1, src9);
+ src += src_stride;
+ LD_SH2(src, 8, src2, src10);
+ src += src_stride;
+ LD_SH2(src, 8, src3, src11);
+ src += src_stride;
+ LD_SH2(src, 8, src4, src12);
+ src += src_stride;
+ LD_SH2(src, 8, src5, src13);
+ src += src_stride;
+ LD_SH2(src, 8, src6, src14);
+ src += src_stride;
+ LD_SH2(src, 8, src7, src15);
+ src += src_stride;
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src11, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
+
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+ src9, src10, src11, src12, src13, src14, src15);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+ res1, res2, res3, res4, res5, res6, res7);
+
+ LD_SH2(src, 8, src0, src8);
+ src += src_stride;
+ LD_SH2(src, 8, src1, src9);
+ src += src_stride;
+ LD_SH2(src, 8, src2, src10);
+ src += src_stride;
+ LD_SH2(src, 8, src3, src11);
+ src += src_stride;
+
+ ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
+
+ LD_SH2(src, 8, src4, src12);
+ src += src_stride;
+ LD_SH2(src, 8, src5, src13);
+ src += src_stride;
+ LD_SH2(src, 8, src6, src14);
+ src += src_stride;
+ LD_SH2(src, 8, src7, src15);
+ src += src_stride;
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
+
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+ src9, src10, src11, src12, src13, src14, src15);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+ res1, res2, res3, res4, res5, res6, res7);
+ ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);
+
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+}
+
+int vpx_satd_msa(const int16_t *data, int length) {
+ int i, satd;
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
+ v8i16 zero = { 0 };
+ v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
+ v4u32 tmp0_w = { 0 };
+
+ if (16 == length) {
+ LD_SH2(data, 8, src0, src1);
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (64 == length) {
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (256 == length) {
+ for (i = 0; i < 2; ++i) {
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ data += 8 * 8;
+ LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+ data += 8 * 8;
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+ }
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (1024 == length) {
+ for (i = 0; i < 8; ++i) {
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ data += 8 * 8;
+ LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+ data += 8 * 8;
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+ }
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else {
+ satd = 0;
+
+ for (i = 0; i < length; ++i) {
+ satd += abs(data[i]);
+ }
+ }
+
+ return satd;
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int i;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v8i16 hbuf_r = { 0 };
+ v8i16 hbuf_l = { 0 };
+ v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
+ v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
+
+ if (16 == height) {
+ for (i = 2; i--;) {
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 3);
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else if (32 == height) {
+ for (i = 2; i--;) {
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 4);
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else if (64 == height) {
+ for (i = 4; i--;) {
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 5);
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else {
+ const int norm_factor = height >> 1;
+ int cnt;
+
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] = 0;
+ }
+
+ for (i = 0; i < height; ++i) {
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] += ref[cnt];
+ }
+
+ ref += ref_stride;
+ }
+
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] /= norm_factor;
+ }
+ }
+}
+
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
+ int16_t sum;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 ref0_h;
+
+ if (16 == width) {
+ ref0 = LD_UB(ref);
+ ref0_h = __msa_hadd_u_h(ref0, ref0);
+ sum = HADD_UH_U32(ref0_h);
+ } else if (32 == width) {
+ LD_UB2(ref, 16, ref0, ref1);
+ ref0_h = __msa_hadd_u_h(ref0, ref0);
+ ref0_h += __msa_hadd_u_h(ref1, ref1);
+ sum = HADD_UH_U32(ref0_h);
+ } else if (64 == width) {
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref0_h = __msa_hadd_u_h(ref0, ref0);
+ ref0_h += __msa_hadd_u_h(ref1, ref1);
+ ref0_h += __msa_hadd_u_h(ref2, ref2);
+ ref0_h += __msa_hadd_u_h(ref3, ref3);
+ sum = HADD_UH_U32(ref0_h);
+ } else {
+ int idx;
+
+ sum = 0;
+ for (idx = 0; idx < width; ++idx) {
+ sum += ref[idx];
+ }
+ }
+
+ return sum;
+}
+
+int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
+ int sse, mean, var;
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
+ v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
+ v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
+ v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
+ v4i32 res_l7_m, mean_v;
+ v2i64 sse_v;
+
+ if (2 == bwl) {
+ LD_SH2(src, 8, src0, src1);
+ LD_SH2(ref, 8, ref0, ref1);
+
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+
+ sse_v += __msa_splati_d(sse_v, 1);
+ sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+ mean = HADD_SW_S32(mean_v);
+ } else if (3 == bwl) {
+ LD_SH4(src, 8, src0, src1, src2, src3);
+ LD_SH4(ref, 8, ref0, ref1, ref2, ref3);
+
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+
+ sse_v += __msa_splati_d(sse_v, 1);
+ sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+ mean = HADD_SW_S32(mean_v);
+ } else if (4 == bwl) {
+ LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+
+ ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v += res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+
+ sse_v += __msa_splati_d(sse_v, 1);
+ sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+ mean = HADD_SW_S32(mean_v);
+ } else {
+ int i;
+ const int width = 4 << bwl;
+
+ sse = 0;
+ mean = 0;
+
+ for (i = 0; i < width; ++i) {
+ const int diff = ref[i] - src[i];
+
+ mean += diff;
+ sse += diff * diff;
+ }
+ }
+
+ var = sse - ((mean * mean) >> (bwl + 2));
+
+ return var;
+}
+
+void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
+ v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;
+
+ LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
+ LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
+ PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
+ PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);
+
+ diff0 = __msa_asub_u_b(s0, d0);
+ diff1 = __msa_asub_u_b(s1, d1);
+ diff2 = __msa_asub_u_b(s2, d2);
+ diff3 = __msa_asub_u_b(s3, d3);
+
+ min0 = __msa_min_u_b(diff0, diff1);
+ min1 = __msa_min_u_b(diff2, diff3);
+ min0 = __msa_min_u_b(min0, min1);
+
+ max0 = __msa_max_u_b(diff0, diff1);
+ max1 = __msa_max_u_b(diff2, diff3);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
+ max0 = __msa_max_u_b(max0, max1);
+
+ *min = min0[0];
+ *max = max0[0];
+}
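A small check, under the assumption that bwl encodes the block width as width = 4 << bwl, of the normalization used by vpx_vector_var_msa() above: shifting mean * mean right by bwl + 2 divides by the width, so the return value is sum((ref - src)^2) minus mean^2 / width.

    #include <assert.h>

    int main(void) {
      int bwl;
      for (bwl = 2; bwl <= 4; ++bwl) {
        const int width = 4 << bwl;
        /* (mean * mean) >> (bwl + 2) is (mean * mean) / width, truncated. */
        assert((1 << (bwl + 2)) == width);
      }
      return 0;
    }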
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c
new file mode 100644
index 0000000000..b22f084a02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vpx_ff_cropTbl;
+
+void vpx_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ vpx_ff_cropTbl_a[i] = 0;
+ vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH];
+}
+
+#endif
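The table filled in above gives a branchless clamp: an index v in [-CROP_WIDTH, 255 + CROP_WIDTH) maps to v limited to [0, 255], which is how the lbux lookups in the DSPR2 convolution kernels use vpx_ff_cropTbl. A sketch of the equivalent scalar behaviour, stated as an assumption rather than a drop-in replacement:

    #include <stdint.h>

    /* Equivalent of vpx_ff_cropTbl[v] for v in [-CROP_WIDTH, 255 + CROP_WIDTH). */
    static uint8_t crop_tbl_equiv(int v) {
      if (v < 0) return 0;
      if (v > 255) return 255;
      return (uint8_t)v;
    }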
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000000..87a5bbab56
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *vpx_ff_cropTbl; // Defined in "vpx_dsp/mips/common_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
new file mode 100644
index 0000000000..18e7d5375d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ uint32_t pos = 38;
+
+ assert(y_step_q4 == 16);
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ w, h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
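For readers not versed in MIPS DSPR2, the following is a minimal plain-C sketch of what the averaging vertical routines above compute per pixel. The names (convolve2_avg_vert_ref, clip_byte) are hypothetical; it assumes FILTER_BITS == 7 (so the +64 bias matches vector4a) and that addqh_r.w performs a rounded halving add with the existing dst pixel, as the inline assembly relies on.

#include <stdint.h>

static uint8_t clip_byte(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

/* Hypothetical scalar model: two-tap vertical filter (taps filter_y[3] and
 * filter_y[4]), round by 1 << (FILTER_BITS - 1), clamp, then rounded average
 * with the value already in dst. */
static void convolve2_avg_vert_ref(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int16_t *filter_y, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
      int px = clip_byte((sum + 64) >> 7);      /* +64 == vector4a rounding bias */
      dst[x] = (uint8_t)((dst[x] + px + 1) >> 1); /* addqh_r.w equivalent */
    }
    src += src_stride;
    dst += dst_stride;
  }
}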
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000000..7dcb662d7f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 8:
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 16:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
+ break;
+ case 32:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
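The filter45 value used throughout these routines packs the two 16-bit bilinear taps, filter_x0[3] and filter_x0[4], into a single 32-bit word so that one dpa.w.ph instruction can multiply-accumulate both taps against a pair of pixels. A small sketch of that packing follows; pack_filter45 is a hypothetical helper, and the dspr2 sources read the same bytes directly via ((const int32_t *)&filter_x0[3])[0].

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: load taps 3 and 4 of an 8-tap kernel as one word. */
static uint32_t pack_filter45(const int16_t *filter_x0) {
  uint32_t packed;
  memcpy(&packed, &filter_x0[3], sizeof(packed)); /* two adjacent int16 taps */
  return packed;
}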
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 0000000000..e355ba3a06
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
+ [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
+ break;
+ }
+}
+#endif
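The "transposed" writers in this file advance dst by one byte per source row and store each output at x * dst_stride, so the pixel filtered from source position (y, x) ends up at dst[x * dst_stride + y]. A scalar sketch of that layout, with a hypothetical name and assuming FILTER_BITS == 7 as in the generic C fallback above:

#include <stdint.h>

/* Hypothetical scalar model of the transposed output layout: the two-tap
 * horizontal result for source (y, x) is written column-major. */
static void convolve2_transposed_ref(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride,
                                     const int16_t *filter, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = src[x] * filter[3] + src[x + 1] * filter[4];
      int px = (sum + 64) >> 7;   /* round by 1 << (FILTER_BITS - 1) */
      if (px < 0) px = 0;
      if (px > 255) px = 255;
      dst[x * dst_stride + y] = (uint8_t)px;
    }
    src += src_stride;
  }
}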
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 0000000000..9e65a8f50f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
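+    /* Each pass through the inner loop below filters one 16-pixel block
+       (8 even outputs from the aligned loads, 8 odd outputs from the loads
+       at src + 1); count is w / 16, as set by the dispatch in
+       vpx_convolve2_horiz_dspr2 below. */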
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+
+ prefetch_load((const uint8_t *)filter_x);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
new file mode 100644
index 0000000000..a3e967b405
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
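+/* Implementation notes: the bilinear vertical path pairs the same pixel from
+ * two consecutive rows into one register (precrq.ph.w takes the upper
+ * halfwords of two words; append shifts one row's pixel up 16 bits and
+ * inserts the other row's low halfword), so one dpa.w.ph against the packed
+ * filter45 taps computes a complete 2-tap column sum. The four accumulators
+ * $ac0..$ac3 produce four adjacent output pixels per asm block. See
+ * convolve2_horiz_dspr2.c for the extp/crop-table details.
+ */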
+static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
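+
+/* For reference, a plain-C sketch of the per-pixel arithmetic above (same
+ * FILTER_BITS and crop-table assumptions as the horizontal file):
+ *
+ *   int sum = 64 + src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
+ *   dst[x] = cm[sum >> 7];
+ */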
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ uint32_t pos = 38;
+
+ assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
new file mode 100644
index 0000000000..cc458c8618
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
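+/* Implementation notes: these are the "avg" variants of the 8-tap vertical
+ * path. vector1b..vector4b each pack two adjacent taps of the 8-tap kernel
+ * (little-endian packing assumed), so four dpa.w.ph instructions per
+ * accumulator form the full column sum. After the clamp, addqh_r.w averages
+ * the filtered byte with the byte already in dst with rounding, i.e.
+ * (a + b + 1) >> 1.
+ */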
+static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
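+
+/* For reference, a plain-C sketch of one output pixel of the routine above
+ * (src has already been rewound by 3 rows; FILTER_BITS == 7 assumed):
+ *
+ *   int k, sum = 64;
+ *   for (k = 0; k < 8; ++k) sum += src[x + k * src_stride] * filter_y[k];
+ *   dst[x] = (dst[x] + cm[sum >> 7] + 1) >> 1;  // addqh_r.w average
+ */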
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
+ h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
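+  /* The vertical pass consumes (h * y_step_q4) >> 4 source rows plus 7 rows
+     of 8-tap context (3 above the block and 4 below). */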
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ if (intermediate_height < h) intermediate_height = h;
+
+ vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height);
+
+ vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ int x, y;
+ uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
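+  /* Each adduh_r.qb below averages four bytes at once with rounding,
+     (a + b + 1) >> 1 per lane, matching the scalar fallback in the default
+     case. */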
+ switch (w) {
+ case 4:
+ /* 1 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+
+ : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 8:
+ /* 2 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 16:
+ /* 4 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 32:
+ /* 8 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_load(src + src_stride + 64);
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 32(%[dst]) \n\t"
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 36(%[src]) \n\t"
+ "ulw %[tp4], 36(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 40(%[src]) \n\t"
+ "ulw %[tp2], 40(%[dst]) \n\t"
+ "sw %[tn1], 32(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 36(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 44(%[src]) \n\t"
+ "ulw %[tp4], 44(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 48(%[src]) \n\t"
+ "ulw %[tp2], 48(%[dst]) \n\t"
+ "sw %[tn1], 40(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 44(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 52(%[src]) \n\t"
+ "ulw %[tp4], 52(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 56(%[src]) \n\t"
+ "ulw %[tp2], 56(%[dst]) \n\t"
+ "sw %[tn1], 48(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 52(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 60(%[src]) \n\t"
+ "ulw %[tp4], 60(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 56(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ default:
+ for (y = h; y > 0; --y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
new file mode 100644
index 0000000000..7a9aa49d8a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
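+/* Implementation notes: the 8-tap horizontal "avg" path mirrors the vertical
+ * one: four packed tap-pair words feed dpa.w.ph, lbu fetches the existing
+ * dst bytes, and addqh_r.w blends them with the filtered result using a
+ * rounding average before the store.
+ */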
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
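+
+/* For reference, a plain-C sketch of one output pixel of the routine above
+ * (assuming the caller has already offset src 3 pixels to the left, as the
+ * convolve8 dispatch does; FILTER_BITS == 7):
+ *
+ *   int k, sum = 64;
+ *   for (k = 0; k < 8; ++k) sum += src[x + k] * filter_x0[k];
+ *   dst[x] = (dst[x] + cm[sum >> 7] + 1) >> 1;
+ */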
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tn3], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tn3], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tn2], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn3], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tn1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
+
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
+
+ /* store bytes */
+ "sb %[tn2], 3(%[dst]) \n\t"
+ "sb %[tn3], 5(%[dst]) \n\t"
+ "sb %[tn1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
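+/* Filter-and-average a row in 16-pixel blocks: each block runs the 8-tap
+ * horizontal filter through the DSPr2 accumulators, clamps via the crop
+ * table, and rounds the result together with the byte already in dst
+ * (addqh_r.w). count is the number of 16-pixel blocks per row; the callers
+ * below pass 1 for w == 16 and 2 for w == 32. */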
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
+ [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
+ [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+ src -= 3;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 8:
+ convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ case 16:
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 1);
+ break;
+ case 32:
+ convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c
new file mode 100644
index 0000000000..1e7052f6c5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
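+/* The *_transposed helpers below write their horizontal filter output down
+ * dst (stores step by dst_stride, and dst itself advances by one byte per
+ * source row), presumably so a later vertical pass can consume the
+ * transposed intermediate buffer with plain horizontal code. */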
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tn1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_stride] "r"(dst_stride));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4, n1;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp2], 0(%[src]) \n\t"
+ "ulw %[tp1], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp1] \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tp3] \n\t"
+ "preceu.ph.qbl %[n1], %[tp3] \n\t"
+ "ulw %[tp2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "lbux %[tp3], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp3], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "ulw %[tp1], 1(%[src]) \n\t"
+ "ulw %[tp3], 5(%[src]) \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[tp2], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "ulw %[tp2], 9(%[src]) \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n1], %[tp2] \n\t"
+ "ulw %[Temp1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[Temp1] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[n1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
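+/* Transposed 16-wide variant: each inner iteration filters 16 horizontal
+ * outputs and stores them transposed. Even outputs land at dst,
+ * dst + 2 * dst_stride, dst + 4 * dst_stride, ...; odd outputs start at
+ * odd_dst = dst + dst_stride; both advance by dst_pitch_2 (2 * dst_stride)
+ * between stores. */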
+static void convolve_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] "
+ "\n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] "
+ "\n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 16(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] "
+ "\n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] "
+ "\n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] "
+ "\n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] "
+ "\n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] "
+ "\n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 17(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] "
+ "\n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] "
+ "\n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbl %[p1], %[qload2] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] "
+ "\n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] "
+ "\n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] "
+ "\n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
+ [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
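[Editor's note] The even/odd stores above write the horizontally filtered row transposed: consecutive output pixels land in consecutive rows of the destination, with the even and odd accumulator pipelines interleaving their results through dst and odd_dst. A minimal scalar sketch of that store pattern, assuming dst_pitch_2 equals 2 * dst_stride (it is set up before this excerpt), would be:

static void store_row_transposed_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                        const uint8_t *row, int n) {
  int i;
  /* output pixel i of the filtered row goes to row i of the destination */
  for (i = 0; i < n; ++i) dst[i * dst_stride] = row[i];
}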
+
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y, k;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x * dst_stride] = src[x];
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+ (void)x_step_q4;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ if (intermediate_height < h) intermediate_height = h;
+
+  /* first pass: horizontal filter (or plain copy) into the transposed intermediate */
+ if (filter_x[3] == 0x80) {
+ copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
+ intermediate_height, w, intermediate_height);
+ } else if (vpx_get_filter_taps(filter_x) == 2) {
+ vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
+ intermediate_height, filter_x, w, intermediate_height);
+ } else {
+ src -= (src_stride * 3 + 3);
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
+ intermediate_height, filter_x,
+ intermediate_height);
+ break;
+ default:
+ convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
+ filter_x, w, intermediate_height);
+ break;
+ }
+ }
+
+  /* second pass: filter (or plain copy) the intermediate buffer into dst */
+ if (filter_y[3] == 0x80) {
+ copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
+ } else if (vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
+ filter_y, h, w);
+ } else {
+ switch (h) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w, (h / 16));
+ break;
+ case 64:
+ convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
+ dst_stride, filter_y, w);
+ break;
+ default:
+ convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
+ filter_y, h, w);
+ break;
+ }
+ }
+}
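[Editor's note] vpx_convolve8_dspr2() implements the 2-D filter as two horizontal passes over a transposed intermediate: pass one applies filter_x and writes temp column-major (stride intermediate_height), so pass two can reuse the same horizontal kernels to apply filter_y and, by writing transposed again, restore raster order in dst. A scalar sketch of the equivalent flow, reusing the convolve_horiz_transposed() reference above and assuming the fixed-step case (y_step_q4 == 16, so intermediate_height == h + 7):

static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x,
                                const int16_t *filter_y, int w, int h) {
  uint8_t temp[64 * 135];  /* column-major intermediate, as in the function above */
  const int im_h = h + 7;  /* 8-tap filter needs 3 rows above and 4 below */

  /* Pass 1: horizontal filter, output written transposed into temp. */
  convolve_horiz_transposed(src - 3 * src_stride - 3, src_stride, temp, im_h,
                            filter_x, w, im_h);
  /* Pass 2: the "horizontal" kernel now runs along former columns, i.e. it
   * applies the vertical filter; writing transposed again restores raster
   * order in dst. */
  convolve_horiz_transposed(temp, im_h, dst, dst_stride, filter_y, h, w);
}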
+
+void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ int x, y;
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4: {
+ uint32_t tp1;
+
+ /* 1 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], (%[src]) \n\t"
+ "sw %[tp1], (%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ uint32_t tp1, tp2;
+
+ /* 2 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 16: {
+ uint32_t tp1, tp2, tp3, tp4;
+
+ /* 4 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 32: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ /* 8 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 64: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_load(src + src_stride + 64);
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 36(%[src]) \n\t"
+ "ulw %[tp3], 40(%[src]) \n\t"
+ "ulw %[tp4], 44(%[src]) \n\t"
+ "ulw %[tp5], 48(%[src]) \n\t"
+ "ulw %[tp6], 52(%[src]) \n\t"
+ "ulw %[tp7], 56(%[src]) \n\t"
+ "ulw %[tp8], 60(%[src]) \n\t"
+
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ default:
+ for (y = h; y--;) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = src[x];
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
new file mode 100644
index 0000000000..09d6f36e56
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[tn1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[n2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
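[Editor's note] In the kernels above the eight 16-bit taps are read back as four packed 32-bit words (vector1b..vector4b, or filter12..filter78 below), which is the operand format dpa.w.ph expects; the accumulator is seeded with 64 (the FILTER_BITS rounding bias), "extp ..., 31" together with the wrdsp pos = 38 issued by the caller is read here as an arithmetic shift right by 7, and lbux clamps the result through the vpx_ff_cropTbl lookup. A scalar sketch of one output pixel under those assumptions:

static uint8_t convolve8_pixel_sketch(const uint8_t *src,
                                      const int16_t *filter /* 8 taps */) {
  int sum = 64;  /* rounding bias pre-loaded into the accumulator (mtlo) */
  int k;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];  /* dpa.w.ph, tap pairs */
  return clip_pixel(sum >> 7);  /* extp (pos = 38, size 31) + lbux crop table */
}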
+
+static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[n1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+ prefetch_load((const uint8_t *)filter_x);
+ src -= 3;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
new file mode 100644
index 0000000000..fd977b5336
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
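[Editor's note] The vertical kernel above loads the same 4-byte column position from eight consecutive rows, pairs the rows into packed halfwords (preceu/append/precrq) so each dpa.w.ph accumulates two row taps at once, and produces four adjacent output pixels per asm block. Its scalar equivalent, with src already rewound by 3 * src_stride as in the function, is roughly:

static void convolve8_vert_block_sketch(const uint8_t *src, int32_t src_stride,
                                        uint8_t *dst,
                                        const int16_t *filter_y /* 8 taps */) {
  int i, k;
  for (i = 0; i < 4; ++i) {  /* four adjacent columns per assembly block */
    int sum = 64;            /* rounding bias */
    for (k = 0; k < 8; ++k) sum += src[k * src_stride + i] * filter_y[k];
    dst[i] = clip_pixel(sum >> 7);
  }
}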
+
+static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vpx_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
new file mode 100644
index 0000000000..14b65bc650
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
+
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
+
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
+
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h);
+
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h);
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c
new file mode 100644
index 0000000000..4e93ff594d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c
@@ -0,0 +1,742 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+extern const int16_t vpx_rv[];
+
+#define VPX_TRANSPOSE8x16_UB_UB( \
+ in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \
+ out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
+ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out8, out10); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out12, out14); \
+ out0 = (v16u8)temp6; \
+ out2 = (v16u8)temp7; \
+ out4 = (v16u8)temp8; \
+ out6 = (v16u8)temp9; \
+ out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
+ out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
+ out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
+ out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
+ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
+ }
+
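+/* Average src_in with its four nearest neighbours along the filter axis,
+ * keeping the result only where every |src - neighbour| difference is below
+ * the ref threshold; __msa_bmz_v restores the original pixel elsewhere.
+ */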
+#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
+ ref, out) \
+ { \
+ v16u8 temp0, temp1; \
+ \
+ temp1 = __msa_aver_u_b(above2_in, above1_in); \
+ temp0 = __msa_aver_u_b(below2_in, below1_in); \
+ temp1 = __msa_aver_u_b(temp1, temp0); \
+ out = __msa_aver_u_b(src_in, temp1); \
+ temp0 = __msa_asub_u_b(src_in, above2_in); \
+ temp1 = __msa_asub_u_b(src_in, above1_in); \
+ temp0 = (temp0 < ref); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below1_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below2_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ out = __msa_bmz_v(out, src_in, temp0); \
+ }
+
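+/* Transpose the first 12 bytes of sixteen input rows into twelve 16-byte
+ * rows (in0..in11); in12..in15 are consumed as inputs only.
+ */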
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
+ ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
+ ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
+ ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
+ ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
+ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \
+ temp4, temp5); \
+ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
+ temp7, temp8, temp9); \
+ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
+ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
+ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
+ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
+ }
+
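+/* Transpose the first 12 bytes of eight input rows into twelve 8-byte rows
+ * held in in0..in11.
+ */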
+#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
+ in9, in10, in11) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3; \
+ v8i16 temp4, temp5, temp6, temp7; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
+ temp4 = __msa_ilvr_h(temp5, temp4); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
+ temp5 = __msa_ilvr_h(temp7, temp6); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ in0 = (v16u8)temp0; \
+ in2 = (v16u8)temp1; \
+ in4 = (v16u8)temp2; \
+ in6 = (v16u8)temp3; \
+ in8 = (v16u8)temp6; \
+ in10 = (v16u8)temp7; \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
+ }
+
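+/* Two passes over an 8-row chroma strip: filter vertically (down) from src
+ * into dst 16 columns at a time, then filter horizontally (across) in place
+ * on dst by transposing 8-column tiles, filtering, and transposing back.
+ */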
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *f_orig = f;
+ uint8_t *p_dst_st = dst_ptr;
+ uint16_t col;
+ uint64_t out0, out1, out2, out3;
+ v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+ v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+
+ p_dst += 16;
+ p_src += 16;
+ f += 16;
+ }
+
+ if (0 != (cols / 16)) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+
+ for (col = 0; col < (cols / 8); ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
+ inter9, inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9);
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
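+/* Luma counterpart of the function above: each column pass covers the 16
+ * rows of a luma macroblock and the horizontal pass works on 12x16 tiles.
+ */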
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *p_dst_st = dst_ptr;
+ uint8_t *f_orig = f;
+ uint16_t col;
+ uint64_t out0, out1, out2, out3;
+ v16u8 above2, above1, below2, below1;
+ v16u8 src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+ v16u8 inter7, inter8, inter9, inter10, inter11;
+ v16u8 inter12, inter13, inter14, inter15;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+ ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
+ p_dst + 8 * dst_stride, dst_stride);
+ p_src += 16;
+ p_dst += 16;
+ f += 16;
+ }
+
+ if (0 != (cols / 16)) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter8, 0);
+ out1 = __msa_copy_u_d((v2i64)inter9, 0);
+ out2 = __msa_copy_u_d((v2i64)inter10, 0);
+ out3 = __msa_copy_u_d((v2i64)inter11, 0);
+ SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter12, 0);
+ out1 = __msa_copy_u_d((v2i64)inter13, 0);
+ out2 = __msa_copy_u_d((v2i64)inter14, 0);
+ out3 = __msa_copy_u_d((v2i64)inter15, 0);
+ SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride);
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+ LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15);
+
+ for (col = 0; col < cols / 8; ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
+ inter7, inter8, inter9, inter10, inter11, inter12, inter13,
+ inter14, inter15);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15, above2, above1);
+
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+ ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+ ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+ LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+ ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+ ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+ LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+ ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+ ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+ LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+ ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+ ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
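+/* size selects the 8-row chroma path or the 16-row luma path for one
+ * macroblock row. */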
+void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f, int32_t size) {
+ if (8 == size) {
+ postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
+ } else if (16 == size) {
+ postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
+ }
+}
+
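+/* Horizontal noise filter: keep a running sum and sum of squares over a
+ * sliding 15-pixel window; a pixel is replaced by the rounded window average
+ * only where the window variance (15 * sum_sq - sum * sum) stays below
+ * flimit. Row ends are padded by replicating the edge pixels.
+ */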
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,
+ int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt;
+ uint8_t *src_dup = src;
+ v16u8 src0, src1, tmp_orig;
+ v16u8 tmp = { 0 };
+ v16i8 zero = { 0 };
+ v8u16 sum_h, src_r_h, src_l_h;
+ v4u32 src_r_w;
+ v4i32 flimit_vec;
+
+ flimit_vec = __msa_fill_w(flimit);
+ for (row = rows; row--;) {
+ int32_t sum_sq;
+ int32_t sum = 0;
+ src0 = (v16u8)__msa_fill_b(src_dup[0]);
+ ST8x1_UB(src0, (src_dup - 8));
+
+ src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
+ ST_UB(src0, src_dup + cols);
+ src_dup[cols + 16] = src_dup[cols - 1];
+ tmp_orig = (v16u8)__msa_ldi_b(0);
+ tmp_orig[15] = tmp[15];
+ src1 = LD_UB(src_dup - 8);
+ src1[15] = 0;
+ ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
+ src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+ src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
+ sum_sq = HADD_SW_S32(src_r_w) + 16;
+ sum_h = __msa_hadd_u_h(src1, src1);
+ sum = HADD_UH_U32(sum_h);
+ {
+ v16u8 src7, src8, src_r, src_l;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+ v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+ v4i32 sub0, sub1, sub2, sub3;
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 mul0, mul1, mul2, mul3;
+ v4i32 total0, total1, total2, total3;
+ v8i16 const8 = __msa_fill_h(8);
+
+ src7 = LD_UB(src_dup + 7);
+ src8 = LD_UB(src_dup - 8);
+ for (col = 0; col < (cols >> 4); ++col) {
+ ILVRL_B2_UB(src7, src8, src_r, src_l);
+ HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
+ sum_r[0] = sum + sub_r[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+ }
+ sum_l[0] = sum_r[7] + sub_l[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+ }
+ sum = sum_l[7];
+ src1 = LD_UB(src_dup + 16 * col);
+ ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
+ src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
+ src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
+ tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
+
+ HADD_UB2_UH(src_r, src_l, add_r, add_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+ ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+ MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
+ mul2, mul3);
+ sum_sq0[0] = sum_sq + mul0[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+ }
+ sum_sq1[0] = sum_sq0[3] + mul1[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+ }
+ sum_sq2[0] = sum_sq1[3] + mul2[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+ }
+ sum_sq3[0] = sum_sq2[3] + mul3[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+ }
+ sum_sq = sum_sq3[3];
+
+ UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+ total0 = sum_sq0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = sum_sq1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = sum_sq2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = sum_sq3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);
+
+ if (col == 0) {
+ uint64_t src_d;
+
+ src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
+ SD(src_d, (src_dup - 8));
+ }
+
+ src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+ src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+ ST_UB(tmp, (src_dup + (16 * col)));
+ }
+
+ src_dup += pitch;
+ }
+ }
+}
+
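+/* Vertical counterpart of the filter above: the same variance test is run
+ * down each column, vpx_rv supplies the rounding offsets, and a 16-row ring
+ * buffer (tmp[]) delays writes until a row has left the filter window.
+ */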
+void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+ int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt, i;
+ v4i32 flimit_vec;
+ v16u8 dst7, dst8, dst_r_b, dst_l_b;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+ v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+ flimit_vec = __msa_fill_w(flimit);
+
+ for (col = 0; col < (cols >> 4); ++col) {
+ uint8_t *dst_tmp = &dst_ptr[col << 4];
+ v16u8 dst;
+ v16i8 zero = { 0 };
+ v16u8 tmp[16];
+ v8i16 mult0, mult1, rv2_0, rv2_1;
+ v8i16 sum0_h = { 0 };
+ v8i16 sum1_h = { 0 };
+ v4i32 mul0 = { 0 };
+ v4i32 mul1 = { 0 };
+ v4i32 mul2 = { 0 };
+ v4i32 mul3 = { 0 };
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 add0, add1, add2, add3;
+ const int16_t *rv2[16];
+
+ dst = LD_UB(dst_tmp);
+ for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
+ rv2[i] = vpx_rv + (i & 7);
+ ++i;
+ }
+ for (cnt = -8; cnt < 0; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+
+ dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+ for (cnt = rows; cnt < rows + 17; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+ for (cnt = -8; cnt <= 6; ++cnt) {
+ dst = LD_UB(dst_tmp + (cnt * pitch));
+ UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+ MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+ mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
+ mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
+ mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
+ mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
+ ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+ }
+
+ for (row = 0; row < (rows + 8); ++row) {
+ for (i = 0; i < 8; ++i) {
+ rv2_0[i] = *(rv2[i] + (row & 127));
+ rv2_1[i] = *(rv2[i + 8] + (row & 127));
+ }
+ dst7 = LD_UB(dst_tmp + (7 * pitch));
+ dst8 = LD_UB(dst_tmp - (8 * pitch));
+ ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+ HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ sum0_h += sub_r;
+ sum1_h += sub_l;
+
+ HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+ ILVRL_H2_SW(zero, add_r, add0, add1);
+ ILVRL_H2_SW(zero, add_l, add2, add3);
+ mul0 += add0 * sub0;
+ mul1 += add1 * sub1;
+ mul2 += add2 * sub2;
+ mul3 += add3 * sub3;
+ dst = LD_UB(dst_tmp);
+ ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+ dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+ dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+ tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
+
+ UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+ total0 = mul0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = mul1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = mul2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = mul3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
+
+ if (row >= 8) {
+ ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+ }
+
+ dst_tmp += pitch;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
new file mode 100644
index 0000000000..36583e2d24
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
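+/* Column pass, step 1: load 32 rows of an 8-column slice, scale by 4 and
+ * apply the first butterfly stage, which pairs row i with row 31 - i.
+ */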
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+ int32_t src_stride,
+ int16_t *temp_buff) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 step0, step1, step2, step3;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+ v8i16 step0_1, step1_1, step2_1, step3_1;
+
+ /* 1st and 2nd set */
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+ ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+
+ /* 3rd and 4th set */
+ LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
+ step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (16 * 8), 8);
+}
+
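+/* Even half of the 32-point column DCT. Each ST_SH scatters eight outputs of
+ * coefficient k to temp + k * 32, i.e. straight into row k of the 32x32
+ * output block.
+ */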
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 temp0, temp1;
+
+ /* fdct even */
+ LD_SH4(input, 8, in0, in1, in2, in3);
+ LD_SH4(input + 96, 8, in12, in13, in14, in15);
+ BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
+ vec3, in12, in13, in14, in15);
+ LD_SH4(input + 32, 8, in4, in5, in6, in7);
+ LD_SH4(input + 64, 8, in8, in9, in10, in11);
+ BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp);
+ ST_SH(temp1, temp + 512);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 256);
+ ST_SH(temp1, temp + 768);
+
+ SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 128);
+ ST_SH(temp1, temp + 896);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 640);
+ ST_SH(temp1, temp + 384);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 64);
+ ST_SH(temp1, temp + 960);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 576);
+ ST_SH(temp1, temp + 448);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 320);
+ ST_SH(temp1, temp + 704);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 192);
+ ST_SH(temp1, temp + 832);
+}
+
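+/* Odd half of the 32-point column DCT. The caller passes
+ * temp_ptr = tmp_buf_big + 32, so the odd coefficients land in the same
+ * row-of-32 layout as the even ones.
+ */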
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(input + 32);
+ in21 = LD_SH(input + 40);
+ in26 = LD_SH(input + 80);
+ in27 = LD_SH(input + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(input + 16);
+ in19 = LD_SH(input + 24);
+ in28 = LD_SH(input + 96);
+ in29 = LD_SH(input + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, input + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, input + 40);
+ vec4 = in29 - in26;
+ ST_SH(vec4, input + 80);
+ vec4 = in28 - in27;
+ ST_SH(vec4, input + 88);
+
+ in21 = in18 + in21;
+ in20 = in19 + in20;
+ in27 = in28 + in27;
+ in26 = in29 + in26;
+
+ LD_SH4(input + 48, 8, in22, in23, in24, in25);
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(input);
+ in17 = LD_SH(input + 8);
+ in30 = LD_SH(input + 112);
+ in31 = LD_SH(input + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, input + 16);
+ vec4 = in16 - in23;
+ ST_SH(vec4, input + 24);
+ vec4 = in31 - in24;
+ ST_SH(vec4, input + 96);
+ vec4 = in30 - in25;
+ ST_SH(vec4, input + 104);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr);
+ ST_SH(vec4, temp_ptr + 960);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 448);
+ ST_SH(vec4, temp_ptr + 512);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 704);
+ ST_SH(vec5, temp_ptr + 256);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 192);
+ ST_SH(vec5, temp_ptr + 768);
+
+ LD_SH4(input + 16, 8, in22, in23, in20, in21);
+ LD_SH4(input + 80, 8, in26, in27, in24, in25);
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 832);
+ ST_SH(vec4, temp_ptr + 128);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 320);
+ ST_SH(vec4, temp_ptr + 640);
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 576);
+ ST_SH(vec4, temp_ptr + 384);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 64);
+ ST_SH(vec4, temp_ptr + 896);
+}
+
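+/* One 8-column slice of the column transform: stage-1 butterflies, then the
+ * even and odd halves. */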
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+ int16_t *tmp_buf, int16_t *tmp_buf_big) {
+ fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+ fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+ fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
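+/* Row pass, step 1: transpose 8x8 tiles of the column results and run the
+ * first butterfly stage, pairing element i with element 31 - i of each row.
+ */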
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+ int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+
+ LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+ /* 2nd set */
+ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
+ step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+ (output + 8 * 8), 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
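+/* Even half for the first 8-row slice. Stage-3 sums are widened to 32 bits
+ * (UNPCK_SH_SW) before the cospi multiplies, presumably so the larger
+ * intermediates of the full-precision transform cannot overflow 16 bits.
+ */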
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+ v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+ v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+ /* Stage 3 */
+ UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+ UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+ UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+ UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+ UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+ UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+ UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+ UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+ ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
+ tmp1_w, tmp2_w, tmp3_w);
+ BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+ ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
+ vec1_r, vec2_r, vec3_r);
+
+ tmp3_w = vec0_r + vec3_r;
+ vec0_r = vec0_r - vec3_r;
+ vec3_r = vec1_r + vec2_r;
+ vec1_r = vec1_r - vec2_r;
+
+ DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out, 8);
+
+ DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out + 16, 8);
+
+ LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 32);
+ ST_SH(in5, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 40);
+ ST_SH(in5, out + 48);
+
+ LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 64);
+ ST_SH(in5, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 72);
+ ST_SH(in5, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 80);
+ ST_SH(in5, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 96);
+ ST_SH(in5, out + 88);
+}
+
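+/* Even half of the row DCT for the remaining slices, kept entirely in
+ * 16-bit arithmetic. */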
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
+
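+/* Odd half of the row DCT; intermediate differences are parked in
+ * interm_ptr and reloaded later because the 16 odd-indexed inputs do not all
+ * fit in registers at once.
+ */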
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
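+/* Gather the scattered even/odd outputs (hence the irregular load offsets),
+ * transpose 8x8 tiles back and store the final coefficients with a row
+ * stride of 32.
+ */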
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+ /* 1st set */
+ in0 = LD_SH(temp);
+ in4 = LD_SH(temp + 32);
+ in2 = LD_SH(temp + 64);
+ in6 = LD_SH(temp + 96);
+ in1 = LD_SH(temp + 128);
+ in7 = LD_SH(temp + 152);
+ in3 = LD_SH(temp + 192);
+ in5 = LD_SH(temp + 216);
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* 2nd set */
+ in0_1 = LD_SH(temp + 16);
+ in1_1 = LD_SH(temp + 232);
+ in2_1 = LD_SH(temp + 80);
+ in3_1 = LD_SH(temp + 168);
+ in4_1 = LD_SH(temp + 48);
+ in5_1 = LD_SH(temp + 176);
+ in6_1 = LD_SH(temp + 112);
+ in7_1 = LD_SH(temp + 240);
+
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ /* 3rd set */
+ in0 = LD_SH(temp + 8);
+ in1 = LD_SH(temp + 136);
+ in2 = LD_SH(temp + 72);
+ in3 = LD_SH(temp + 200);
+ in4 = LD_SH(temp + 40);
+ in5 = LD_SH(temp + 208);
+ in6 = LD_SH(temp + 104);
+ in7 = LD_SH(temp + 144);
+
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
+ 32);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+ /* 4th set */
+ in0_1 = LD_SH(temp + 24);
+ in1_1 = LD_SH(temp + 224);
+ in2_1 = LD_SH(temp + 88);
+ in3_1 = LD_SH(temp + 160);
+ in4_1 = LD_SH(temp + 56);
+ in5_1 = LD_SH(temp + 184);
+ in6_1 = LD_SH(temp + 120);
+ in7_1 = LD_SH(temp + 248);
+
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
+ 32);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+ fdct8x32_1d_row_even(temp_buf, temp_buf);
+ fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+ fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
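+/* Full 32x32 forward DCT: four 8-column slices through the column pass,
+ * then four 8-row slices through the row pass; the first row slice uses the
+ * wider-precision _4x variant.
+ */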
+void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+ tmp_buf_big + (8 * i));
+ }
+
+ /* row transform */
+ fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform for the remaining rows */
+ for (i = 1; i < 4; ++i) {
+ fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+ }
+}
+
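+/* "_rd" variant of the even half, presumably for the reduced-precision
+ * vpx_fdct32x32_rd transform: the rounding post-process runs right after
+ * stage 2 and is omitted from the later stages.
+ */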
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+ FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+ FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+
+ temp0 = in0 + in3;
+ in0 = in0 - in3;
+ in3 = in1 + in2;
+ in1 = in1 - in2;
+
+ DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
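
Throughout these files, DOTP_CONST_PAIR implements the planar rotation at the heart of every butterfly stage. Our per-lane reading of it, as a hedged scalar sketch (the helper name is ours; DCT_CONST_BITS is 14 in libvpx):

#include <stdint.h>

/* Per-lane scalar model of DOTP_CONST_PAIR(in0, in1, c0, c1, out0, out1):
 * a rotation by the cosine pair, with a rounded right shift by 14. */
static void dotp_const_pair_sketch(int16_t in0, int16_t in1, int16_t c0,
                                   int16_t c1, int16_t *out0, int16_t *out1) {
  const int32_t rnd = 1 << 13; /* 1 << (DCT_CONST_BITS - 1) */
  *out0 = (int16_t)((in0 * c0 - in1 * c1 + rnd) >> 14);
  *out1 = (int16_t)((in0 * c1 + in1 * c0 + rnd) >> 14);
}
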
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+ v8i16 vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ FDCT_POSTPROC_2V_NEG_H(in20, in21);
+ FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ FDCT_POSTPROC_2V_NEG_H(in18, in19);
+ FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+ FDCT_POSTPROC_2V_NEG_H(in22, in23);
+ FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ FDCT_POSTPROC_2V_NEG_H(in16, in17);
+ FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ in16 = in28 + in29;
+ in19 = in31 + in30;
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+ fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+ &tmp_buf_big[0] + (8 * i));
+ }
+
+ /* row transform */
+ for (i = 0; i < 4; ++i) {
+ fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+ out + (8 * i * 32));
+ }
+}
+
+void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ int sum, i;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v4i32 vec_w = { 0 };
+
+ for (i = 0; i < 16; ++i) {
+ LD_SH4(input, 8, in0, in1, in2, in3);
+ input += stride;
+ LD_SH4(input, 8, in4, in5, in6, in7);
+ input += stride;
+ ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+ ADD2(in0, in2, in4, in6, in0, in4);
+ vec_w += __msa_hadd_s_w(in0, in0);
+ vec_w += __msa_hadd_s_w(in4, in4);
+ }
+
+ sum = HADD_SW_S32(vec_w);
+ out[0] = (int16_t)(sum >> 3);
+}
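
vpx_fdct32x32_1_msa only needs the DC coefficient, so it reduces to a block sum plus a scaling shift; the vector loop above accumulates two 32-sample rows per iteration via __msa_hadd_s_w before HADD_SW_S32 reduces the four partial sums. A scalar sketch of the same computation (hypothetical helper; the 16x16 and 8x8 DC-only variants in fwd_txfm_msa.c below follow the same pattern with shifts of 1 and 0):

#include <stdint.h>

/* Scalar model of the DC-only forward transforms: sum the block, then
 * apply the per-size scaling shift (3 for 32x32, 1 for 16x16, 0 for 8x8). */
static int16_t fdct_dc_sketch(const int16_t *input, int stride, int size,
                              int shift) {
  int r, c, sum = 0;
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c) sum += input[r * stride + c];
  return (int16_t)(sum >> shift);
}
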
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
new file mode 100644
index 0000000000..5a6dfcef2f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v4i32 vec_w;
+
+ LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+ ADD2(in0, in2, in4, in6, in0, in4);
+ vec_w = __msa_hadd_s_w(in0, in0);
+ vec_w += __msa_hadd_s_w(in4, in4);
+ out[0] = HADD_SW_S32(vec_w);
+ out[1] = 0;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+ v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+ v8i16 coeff2 = {
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
+ };
+
+ LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in8, in9, in10, in11, 2);
+ SLLI_4V(in12, in13, in14, in15, 2);
+ ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+ ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+ SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+ SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+ ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __msa_splati_h(coeff, 0);
+ stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+ cnst5 = __msa_splati_h(coeff, 1);
+ cnst5 = __msa_ilvev_h(cnst5, cnst4);
+ stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+ stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+ stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+ /* stp2 */
+ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+ ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+ SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+ cnst0 = __msa_splati_h(coeff, 4);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+ stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+ BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ ILVRL_H2_SH(in15, in8, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr);
+
+ cnst0 = __msa_splati_h(coeff2, 0);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 224);
+
+ ILVRL_H2_SH(in14, in9, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 128);
+
+ cnst1 = __msa_splati_h(coeff2, 2);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 96);
+
+ SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ cnst1 = __msa_splati_h(coeff, 3);
+ cnst1 = __msa_ilvev_h(cnst0, cnst1);
+ stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ /* stp4 */
+ ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+ ILVRL_H2_SH(in13, in10, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 64);
+
+ cnst0 = __msa_splati_h(coeff2, 1);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 160);
+
+ SUB2(stp34, stp25, stp33, stp22, in12, in11);
+ ILVRL_H2_SH(in12, in11, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 192);
+
+ cnst1 = __msa_splati_h(coeff2, 3);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 32);
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+
+ LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+ ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+ SRA_4V(in0, in1, in2, in3, 2);
+ SRA_4V(in4, in5, in6, in7, 2);
+ SRA_4V(in8, in9, in10, in11, 2);
+ SRA_4V(in12, in13, in14, in15, 2);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
+void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 vec, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ vec = __msa_ceqi_h(in0, 0);
+ vec = vec ^ 255;
+ vec = mask & vec;
+ in0 += vec;
+ }
+
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
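
The fdct4 pre-process block in vpx_fdct4x4_msa above builds a mask that is nonzero only in lane 0, so after the << 4 upscale just the first input sample is bumped by 1 when it is nonzero. A scalar sketch of our reading:

/* Scalar model of the fdct4 pre-process: samples are already scaled by
 * 16; nudge only the very first one, and only if it is nonzero. */
static void fdct4_preprocess_sketch(int16_t *samples /* 4x4 block */) {
  if (samples[0]) samples[0] += 1;
}
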
+
+void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
+
+void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+ }
+}
+
+void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ int sum, i;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v4i32 vec_w = { 0 };
+
+ for (i = 0; i < 4; ++i) {
+ LD_SH2(input, 8, in0, in1);
+ input += stride;
+ LD_SH2(input, 8, in2, in3);
+ input += stride;
+ LD_SH2(input, 8, in4, in5);
+ input += stride;
+ LD_SH2(input, 8, in6, in7);
+ input += stride;
+ ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+ ADD2(in0, in2, in4, in6, in0, in4);
+ vec_w += __msa_hadd_s_w(in0, in0);
+ vec_w += __msa_hadd_s_w(in4, in4);
+ }
+
+ sum = HADD_SW_S32(vec_w);
+ out[0] = (int16_t)(sum >> 1);
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
new file mode 100644
index 0000000000..c0be56b819
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 coeff_m = { \
+ cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
+ }; \
+ \
+ BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
+ vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
+ cnst2_m = __msa_splati_h(coeff_m, 2); \
+ cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
+ vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
+ \
+ SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \
+ vec7_m, out0, out2, out1, out3); \
+ }
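
VP9_FDCT4 vectorizes the 4-point forward DCT across eight lanes at once. A scalar sketch of the underlying per-column math, with the cospi constants from vpx_dsp/txfm_common.h written out as literals (to our understanding cospi_16_64 = 11585, cospi_8_64 = 15137, cospi_24_64 = 6270):

#include <stdint.h>

/* One 4-point forward DCT, the per-column math behind VP9_FDCT4. */
static void fdct4_sketch(const int16_t in[4], int16_t out[4]) {
  const int32_t c16 = 11585, c8 = 15137, c24 = 6270, rnd = 1 << 13;
  const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];
  const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
  out[0] = (int16_t)(((s0 + s1) * c16 + rnd) >> 14);
  out[2] = (int16_t)(((s0 - s1) * c16 + rnd) >> 14);
  out[1] = (int16_t)((s2 * c24 + s3 * c8 + rnd) >> 14);
  out[3] = (int16_t)((s3 * c24 - s2 * c8 + rnd) >> 14);
}
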
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
+ SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
+ AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
+ in2, in3); \
+ AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
+ in6, in7); \
+ }
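
SRLI_AVE_S_4V_H extracts each lane's sign bit with a logical shift by 15 and folds it back in with a signed average; this is the final output scaling of vpx_fdct8x8_msa. Assuming AVE_SH4_SH computes a truncating (a + b) >> 1, the per-lane scalar form is:

/* Scalar model of SRLI_AVE_S_4V_H per lane: divide by two, rounding
 * negative values toward zero, i.e. (x + (x < 0)) >> 1. */
static int16_t srli_ave_sketch(int16_t x) {
  return (int16_t)((x + (x < 0)) >> 1);
}
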
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
+
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v8i16 x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
+ s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ }
+
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ { \
+ v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
+ v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \
+ v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \
+ v8i16 coeff2_m = { \
+ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \
+ }; \
+ \
+ /* stp 1 */ \
+ ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
+ ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __msa_splati_h(coeff_m, 0); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
+ \
+ cnst5_m = __msa_splati_h(coeff_m, 1); \
+ cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
+ stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
+ stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
+ \
+ /* stp2 */ \
+ BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \
+ stp33_m); \
+ BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \
+ stp34_m); \
+ \
+ ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
+ ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 4); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 3); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ /* stp4 */ \
+ BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \
+ vec5_m); \
+ BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
+ stp31_m); \
+ \
+ ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ \
+ out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 0); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 2); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 1); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 3); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ }
+
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one_m = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clti_s_h(vec0, 0); \
+ tp1_m = __msa_clti_s_h(vec1, 0); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one_m & tp0_m; \
+ tp1_m = one_m & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
+
+#define FDCT32_POSTPROC_NEG_W(vec) \
+ { \
+ v4i32 temp_m; \
+ v4i32 one_m = __msa_ldi_w(1); \
+ \
+ temp_m = __msa_clti_s_w(vec, 0); \
+ vec += 1; \
+ temp_m = one_m & temp_m; \
+ vec += temp_m; \
+ vec >>= 2; \
+ }
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+ { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clei_s_h(vec0, 0); \
+ tp1_m = __msa_clei_s_h(vec1, 0); \
+ tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
+ tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one & tp0_m; \
+ tp1_m = one & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+ }
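
The three postprocessing macros above all perform a rounded divide by four on intermediate coefficients; they differ only in which side of zero receives the extra rounding bias. Scalar sketches of our reading (helper names are ours):

#include <stdint.h>

static int16_t postproc_neg_h_sketch(int16_t x) { /* FDCT_POSTPROC_2V_NEG_H */
  return (int16_t)((x + 1 + (x < 0)) >> 2);
}
static int32_t postproc_neg_w_sketch(int32_t x) { /* FDCT32_POSTPROC_NEG_W */
  return (x + 1 + (x < 0)) >> 2;
}
static int16_t postproc_pos_h_sketch(int16_t x) { /* FDCT32_POSTPROC_2V_POS_H */
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}
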
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
+ const0, const1, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
+ v4i32 k0_m = __msa_fill_w((int32_t)const0); \
+ \
+ s0_m = __msa_fill_w((int32_t)const1); \
+ k0_m = __msa_ilvev_w(s0_m, k0_m); \
+ \
+ ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
+ ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
+ ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
+ ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
+ \
+ DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ \
+ DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ }
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c
new file mode 100644
index 0000000000..7ca61a28ec
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ input += 8;
+ LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
+ reg2, reg3, reg4, reg5, reg6, reg7);
+ TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
+ reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+ SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
+ reg8);
+ ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
+ reg10);
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+
+ reg13 = loc2;
+
+ /* Transpose and store the output */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ /* transpose block */
+ TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
+ reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+ ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
+
+ /* transpose block */
+ TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
+ reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+ ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
+}
+
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load top 8x8 */
+ LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ input += 8 * 16;
+ /* load bottom 8x8 */
+ LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+ BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+ BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+ BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+ BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+ BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+ reg13 = loc2;
+
+ /* restore the saved values, then round, add to dst and store */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+ dst += (4 * dst_stride);
+ SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* process 16 * 8 block */
+ vpx_idct16_1d_rows_msa(input, out);
+
+ /* the 10-coefficient case produces valid output only in the top 4 rows; zero the rest */
+ out += 4 * 16;
+ for (i = 12; i--;) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[out]) \n\t"
+ "sw $zero, 4(%[out]) \n\t"
+ "sw $zero, 8(%[out]) \n\t"
+ "sw $zero, 12(%[out]) \n\t"
+ "sw $zero, 16(%[out]) \n\t"
+ "sw $zero, 20(%[out]) \n\t"
+ "sw $zero, 24(%[out]) \n\t"
+ "sw $zero, 28(%[out]) \n\t"
+
+ :
+ : [out] "r"(out));
+
+ out += 16;
+ }
+
+ out = out_arr;
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
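
The inline assembly above stores eight zero words (32 bytes, i.e. one 16-coefficient row) per iteration; functionally it is just a memset over the twelve rows that the 10-coefficient case never produces:

#include <string.h>

/* Equivalent of the sw-zero loop: clear rows 4..15 of the 16x16 buffer. */
static void zero_bottom_rows_sketch(int16_t *out /* points at row 4 */) {
  memset(out, 0, 12 * 16 * sizeof(*out));
}
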
+
+void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t i;
+ int16_t out;
+ v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 6);
+
+ vec = __msa_fill_h(out);
+
+ for (i = 4; i--;) {
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ UNPCK_UB_SH(dst0, res0, res4);
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ CLIP_SH4_0_255(res4, res5, res6, res7);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
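
vpx_idct16x16_1_add_msa reconstructs from the DC coefficient alone: two rounded multiplications by cospi_16_64 and a final rounding shift yield one constant that is added to every pixel with clamping. A scalar sketch, assuming the usual libvpx values cospi_16_64 = 11585 and DCT_CONST_BITS = 14:

#include <stdint.h>

static int32_t round_shift14_sketch(int32_t x) { return (x + (1 << 13)) >> 14; }

/* Scalar model of the DC-only 16x16 inverse transform + reconstruction. */
static void idct16x16_dc_add_sketch(int16_t dc, uint8_t *dst, int stride) {
  int r, c;
  int32_t out = round_shift14_sketch(dc * 11585);
  out = round_shift14_sketch(out * 11585);
  out = (out + 32) >> 6; /* ROUND_POWER_OF_TWO(out, 6) */
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) {
      const int32_t v = dst[r * stride + c] + out;
      dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
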
+
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
+ l7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
+ l12, l13, l14, l15);
+
+ /* ADST in horizontal */
+ VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
+ l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+
+ l1 = -r8;
+ l3 = -r4;
+ l13 = -r13;
+ l15 = -r1;
+
+ TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
+ l6, l7);
+ ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+ TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
+ l13, l14, l15);
+ ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+ v16i8 zero = { 0 };
+
+ r0 = LD_SH(input + 0 * 16);
+ r3 = LD_SH(input + 3 * 16);
+ r4 = LD_SH(input + 4 * 16);
+ r7 = LD_SH(input + 7 * 16);
+ r8 = LD_SH(input + 8 * 16);
+ r11 = LD_SH(input + 11 * 16);
+ r12 = LD_SH(input + 12 * 16);
+ r15 = LD_SH(input + 15 * 16);
+
+ /* stage 1 */
+ k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+ k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+ BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+ k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ r1 = LD_SH(input + 1 * 16);
+ r2 = LD_SH(input + 2 * 16);
+ r5 = LD_SH(input + 5 * 16);
+ r6 = LD_SH(input + 6 * 16);
+ r9 = LD_SH(input + 9 * 16);
+ r10 = LD_SH(input + 10 * 16);
+ r13 = LD_SH(input + 13 * 16);
+ r14 = LD_SH(input + 14 * 16);
+
+ k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+ k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+ BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+ BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+ out1 = -out1;
+ SRARI_H2_SH(out0, out1, 6);
+ dst0 = LD_UB(dst + 0 * dst_stride);
+ dst1 = LD_UB(dst + 15 * dst_stride);
+ ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+ ADD2(res0, out0, res1, out1, res0, res1);
+ CLIP_SH2_0_255(res0, res1);
+ PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+ ST8x1_UB(res0, dst);
+ ST8x1_UB(res1, dst + 15 * dst_stride);
+
+ k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+
+ SRARI_H2_SH(out8, out9, 6);
+ dst8 = LD_UB(dst + 1 * dst_stride);
+ dst9 = LD_UB(dst + 14 * dst_stride);
+ ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+ ADD2(res8, out8, res9, out9, res8, res9);
+ CLIP_SH2_0_255(res8, res9);
+ PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+ ST8x1_UB(res8, dst + dst_stride);
+ ST8x1_UB(res9, dst + 14 * dst_stride);
+
+ k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+ MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ SRARI_H2_SH(out4, out5, 6);
+ dst4 = LD_UB(dst + 3 * dst_stride);
+ dst5 = LD_UB(dst + 12 * dst_stride);
+ ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+ ADD2(res4, out4, res5, out5, res4, res5);
+ CLIP_SH2_0_255(res4, res5);
+ PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+ ST8x1_UB(res4, dst + 3 * dst_stride);
+ ST8x1_UB(res5, dst + 12 * dst_stride);
+
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ SRARI_H2_SH(out12, out13, 6);
+ dst12 = LD_UB(dst + 2 * dst_stride);
+ dst13 = LD_UB(dst + 13 * dst_stride);
+ ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+ ADD2(res12, out12, res13, out13, res12, res13);
+ CLIP_SH2_0_255(res12, res13);
+ PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+ ST8x1_UB(res12, dst + 2 * dst_stride);
+ ST8x1_UB(res13, dst + 13 * dst_stride);
+
+ k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ SRARI_H2_SH(out6, out7, 6);
+ dst6 = LD_UB(dst + 4 * dst_stride);
+ dst7 = LD_UB(dst + 11 * dst_stride);
+ ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+ ADD2(res6, out6, res7, out7, res6, res7);
+ CLIP_SH2_0_255(res6, res7);
+ PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+ ST8x1_UB(res6, dst + 4 * dst_stride);
+ ST8x1_UB(res7, dst + 11 * dst_stride);
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ SRARI_H2_SH(out10, out11, 6);
+ dst10 = LD_UB(dst + 6 * dst_stride);
+ dst11 = LD_UB(dst + 9 * dst_stride);
+ ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+ ADD2(res10, out10, res11, out11, res10, res11);
+ CLIP_SH2_0_255(res10, res11);
+ PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+ ST8x1_UB(res10, dst + 6 * dst_stride);
+ ST8x1_UB(res11, dst + 9 * dst_stride);
+
+ k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ SRARI_H2_SH(out2, out3, 6);
+ dst2 = LD_UB(dst + 7 * dst_stride);
+ dst3 = LD_UB(dst + 8 * dst_stride);
+ ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+ ADD2(res2, out2, res3, out3, res2, res3);
+ CLIP_SH2_0_255(res2, res3);
+ PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+ ST8x1_UB(res2, dst + 7 * dst_stride);
+ ST8x1_UB(res3, dst + 8 * dst_stride);
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ SRARI_H2_SH(out14, out15, 6);
+ dst14 = LD_UB(dst + 5 * dst_stride);
+ dst15 = LD_UB(dst + 10 * dst_stride);
+ ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+ ADD2(res14, out14, res15, out15, res14, res15);
+ CLIP_SH2_0_255(res14, res15);
+ PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+ ST8x1_UB(res14, dst + 5 * dst_stride);
+ ST8x1_UB(res15, dst + 10 * dst_stride);
+}
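
Every output row of vpx_iadst16_1d_columns_addblk_msa repeats the same reconstruction idiom: zero-extend eight destination pixels (ILVR_B2_SH against a zero vector), add the rounded residual, clamp to [0, 255] and pack back to bytes. A scalar sketch of one such 8-pixel step (helper name is ours):

#include <stdint.h>

/* One 8-pixel reconstruction row, the scalar shape of the
 * ILVR/ADD2/CLIP/PCKEV/ST8x1 sequence above. */
static void add_residual_row_sketch(const int16_t *res, uint8_t *dst) {
  int c;
  for (c = 0; c < 8; ++c) {
    const int v = dst[c] + res[c];
    dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
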
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c
new file mode 100644
index 0000000000..053948183a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+static void idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
+ ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
+ ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
+
+ /* 3rd & 4th 8x8 */
+ LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
+ ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
+ ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
+ ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
+}
+
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3: depends on Even stage 1 & Even stage 2 */
+ BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 15 * 8));
+ ST_SH(loc1, (tmp_eve_buf));
+ ST_SH(loc2, (tmp_eve_buf + 14 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 8));
+
+ BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 13 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 2 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 12 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 11 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 4 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 10 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ ST_SH(loc0, (tmp_eve_buf + 9 * 8));
+ ST_SH(loc1, (tmp_eve_buf + 6 * 8));
+ ST_SH(loc2, (tmp_eve_buf + 8 * 8));
+ ST_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
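
BUTTERFLY_4, used throughout the even and odd stages here, is the generic add/subtract pairing from macros_msa.h; per lane it reads, to our understanding:

#include <stdint.h>

/* Scalar model of BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3). */
static void butterfly4_sketch(int16_t in0, int16_t in1, int16_t in2,
                              int16_t in3, int16_t out[4]) {
  out[0] = in0 + in3;
  out[1] = in1 + in2;
  out[2] = in1 - in2;
  out[3] = in0 - in3;
}
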
+
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LD_SH(tmp_buf + 8);
+ reg1 = LD_SH(tmp_buf + 7 * 8);
+ reg2 = LD_SH(tmp_buf + 9 * 8);
+ reg3 = LD_SH(tmp_buf + 15 * 8);
+ reg4 = LD_SH(tmp_buf + 17 * 8);
+ reg5 = LD_SH(tmp_buf + 23 * 8);
+ reg6 = LD_SH(tmp_buf + 25 * 8);
+ reg7 = LD_SH(tmp_buf + 31 * 8);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LD_SH(tmp_buf + 3 * 8);
+ reg1 = LD_SH(tmp_buf + 5 * 8);
+ reg2 = LD_SH(tmp_buf + 11 * 8);
+ reg3 = LD_SH(tmp_buf + 13 * 8);
+ reg4 = LD_SH(tmp_buf + 19 * 8);
+ reg5 = LD_SH(tmp_buf + 21 * 8);
+ reg6 = LD_SH(tmp_buf + 27 * 8);
+ reg7 = LD_SH(tmp_buf + 29 * 8);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+ /* 4 Stores */
+ ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
+ BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ ST_SH(reg0, (tmp_odd_buf + 13 * 8));
+ ST_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Odd stage 3: depends on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+ LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+ SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Load 8 & Store 8 */
+ LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+ LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, int16_t *dst) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ vec0 = LD_SH(tmp_odd_buf);
+ vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LD_SH(tmp_eve_buf);
+ loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+ ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+ ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+ ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+ ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+ /* Transpose : 16 vectors */
+ /* 1st & 2nd 8x8 */
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+ ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+ ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+ /* 3rd & 4th 8x8 */
+ LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+ LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+ ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+ TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+ ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct32x8_row_transpose_store(input, &tmp_buf[0]);
+ idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+ idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+ idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+ output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+ tmp_buf += (2 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ /* Load 8 */
+ LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 */
+ BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+ BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+ /* Store 8 */
+ BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+ BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+ ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LD_SH(tmp_buf + 32);
+ reg1 = LD_SH(tmp_buf + 7 * 32);
+ reg2 = LD_SH(tmp_buf + 9 * 32);
+ reg3 = LD_SH(tmp_buf + 15 * 32);
+ reg4 = LD_SH(tmp_buf + 17 * 32);
+ reg5 = LD_SH(tmp_buf + 23 * 32);
+ reg6 = LD_SH(tmp_buf + 25 * 32);
+ reg7 = LD_SH(tmp_buf + 31 * 32);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+ SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LD_SH(tmp_buf + 3 * 32);
+ reg1 = LD_SH(tmp_buf + 5 * 32);
+ reg2 = LD_SH(tmp_buf + 11 * 32);
+ reg3 = LD_SH(tmp_buf + 13 * 32);
+ reg4 = LD_SH(tmp_buf + 19 * 32);
+ reg5 = LD_SH(tmp_buf + 21 * 32);
+ reg6 = LD_SH(tmp_buf + 27 * 32);
+ reg7 = LD_SH(tmp_buf + 29 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+ /* 4 Stores */
+ ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
+ BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8 */
+ LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+ LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+ SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+ /* Load 8 & Store 8 */
+ LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+ LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+ ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+ SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd */
+ vec0 = LD_SH(tmp_odd_buf);
+ vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LD_SH(tmp_eve_buf);
+ loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+ SRARI_H4_SH(m0, m2, m4, m6, 6);
+ VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+ SRARI_H4_SH(m0, m2, m4, m6, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
+ m6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+ SRARI_H4_SH(m1, m3, m5, m7, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+ SRARI_H4_SH(m1, m3, m5, m7, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
+ m7);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+ SRARI_H4_SH(n0, n2, n4, n6, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+ SRARI_H4_SH(n0, n2, n4, n6, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
+ n6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+ ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+ SRARI_H4_SH(n1, n3, n5, n7, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
+
+ SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+ SRARI_H4_SH(n1, n3, n5, n7, 6);
+ VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
+ n7);
+}
+
+static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+ idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+ idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+ dst_stride);
+}
+
+void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block */
+ idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ for (i = 32; i--;) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[out_ptr]) \n\t"
+ "sw $zero, 4(%[out_ptr]) \n\t"
+ "sw $zero, 8(%[out_ptr]) \n\t"
+ "sw $zero, 12(%[out_ptr]) \n\t"
+ "sw $zero, 16(%[out_ptr]) \n\t"
+ "sw $zero, 20(%[out_ptr]) \n\t"
+ "sw $zero, 24(%[out_ptr]) \n\t"
+ "sw $zero, 28(%[out_ptr]) \n\t"
+ "sw $zero, 32(%[out_ptr]) \n\t"
+ "sw $zero, 36(%[out_ptr]) \n\t"
+ "sw $zero, 40(%[out_ptr]) \n\t"
+ "sw $zero, 44(%[out_ptr]) \n\t"
+ "sw $zero, 48(%[out_ptr]) \n\t"
+ "sw $zero, 52(%[out_ptr]) \n\t"
+ "sw $zero, 56(%[out_ptr]) \n\t"
+ "sw $zero, 60(%[out_ptr]) \n\t"
+
+ :
+ : [out_ptr] "r"(out_ptr));
+
+ out_ptr += 32;
+ }
+
+ out_ptr = out_arr;
+
+ /* rows: only upper-left 8x8 has non-zero coeff */
+ idct32x8_1d_rows_msa(input, out_ptr);
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int32_t i;
+ int16_t out;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 6);
+
+ vec = __msa_fill_h(out);
+
+ for (i = 16; i--;) {
+ LD_UB2(dst, 16, dst0, dst1);
+ LD_UB2(dst + dst_stride, 16, dst2, dst3);
+
+ UNPCK_UB_SH(dst0, res0, res4);
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ CLIP_SH4_0_255(res4, res5, res6, res7);
+ PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
+ tmp2, tmp3);
+
+ ST_UB2(tmp0, tmp1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(tmp2, tmp3, dst, 16);
+ dst += dst_stride;
+ }
+}
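For reference while reviewing the DC-only path in vpx_idct32x32_1_add_msa above, here is a minimal scalar sketch of the per-pixel arithmetic it vectorizes. It assumes the ROUND_POWER_OF_TWO and clip_pixel helpers from vpx_dsp/vpx_dsp_common.h and the cospi_16_64 / DCT_CONST_BITS constants from vpx_dsp/txfm_common.h; it illustrates the arithmetic only and is not the library's C reference.

#include <stdint.h>

#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"

/* Scalar sketch: scale the DC coefficient by cospi_16_64 twice with
 * DCT_CONST_BITS rounding, round by 6 bits, then add the result to every
 * pixel of the 32x32 block with saturation to [0, 255]. */
static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dst,
                                   int stride) {
  int r, c;
  int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      dst[r * stride + c] = clip_pixel(dst[r * stride + c] + out);
}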
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c
new file mode 100644
index 0000000000..56ffec3cba
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3;
+ v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in2, in3, in1);
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+ UNPCK_R_SH_SW(in0, in0_r);
+ UNPCK_R_SH_SW(in2, in2_r);
+ UNPCK_R_SH_SW(in3, in3_r);
+ UNPCK_R_SH_SW(in1, in1_r);
+ SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
+
+ in0_r += in2_r;
+ in3_r -= in1_r;
+ in4_r = (in0_r - in3_r) >> 1;
+ in1_r = in4_r - in1_r;
+ in2_r = in4_r - in2_r;
+ in0_r -= in1_r;
+ in3_r += in2_r;
+
+ TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
+
+ in0_r += in1_r;
+ in2_r -= in3_r;
+ in4_r = (in0_r - in2_r) >> 1;
+ in3_r = in4_r - in3_r;
+ in1_r = in4_r - in1_r;
+ in0_r -= in3_r;
+ in2_r += in1_r;
+
+ PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
+ in2, in3);
+ ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
+}
+
+void vpx_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t a1, e1;
+ v8i16 in1, in0 = { 0 };
+
+ a1 = input[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+
+ in0 = __msa_insert_h(in0, 0, a1);
+ in0 = __msa_insert_h(in0, 1, e1);
+ in0 = __msa_insert_h(in0, 2, e1);
+ in0 = __msa_insert_h(in0, 3, e1);
+
+ in1 = in0 >> 1;
+ in0 -= in1;
+
+ ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
+}
+
+void vpx_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in1, in2, in3);
+ /* rows */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* columns */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* rounding (add 2^3, divide by 2^4) */
+ SRARI_H4_SH(in0, in1, in2, in3, 4);
+ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+void vpx_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t out;
+ v8i16 vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO(out, 4);
+ vec = __msa_fill_h(out);
+
+ ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
+}
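The vector arithmetic in vpx_iwht4x4_16_add_msa above, including the unusual LD4x4_SH load order (in0, in2, in3, in1), follows the scalar inverse Walsh-Hadamard butterfly, with in0, in2, in3, in1 playing the roles of a1, c1, d1, b1 and in4_r the role of e1. A minimal sketch of one 1-D pass follows; as in the code above, the UNIT_QUANT_SHIFT pre-shift is applied only before the first (row) pass.

#include <stdint.h>

/* One 1-D inverse WHT pass over four values (illustrative sketch). */
static void iwht4_1d_sketch(const int32_t *in, int32_t *out) {
  int32_t a1 = in[0], c1 = in[1], d1 = in[2], b1 = in[3], e1;
  a1 += c1;
  d1 -= b1;
  e1 = (a1 - d1) >> 1;
  b1 = e1 - b1;
  c1 = e1 - c1;
  a1 -= b1;
  d1 += c1;
  out[0] = a1;
  out[1] = b1;
  out[2] = c1;
  out[3] = d1;
}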
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c
new file mode 100644
index 0000000000..a383ff2066
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+ /* rows transform */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* 1D idct8x8 */
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* columns transform */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* 1D idct8x8 */
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+ /* add block and store 8x8 */
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ dst += (4 * dst_stride);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+ v4i32 tmp0, tmp1, tmp2, tmp3;
+ v8i16 zero = { 0 };
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ /* stage1 */
+ ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+ k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+ SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+ PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+ BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+ /* stage2 */
+ ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+ k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+ SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+ PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+ BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+ /* stage3 */
+ s0 = __msa_ilvr_h(s6, s5);
+
+ k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+ SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
+ PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+ /* stage4 */
+ BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
+ in7);
+ TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+ /* add block and store 8x8 */
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ dst += (4 * dst_stride);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void vpx_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) {
+ int16_t out;
+ int32_t val;
+ v8i16 vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+ val = ROUND_POWER_OF_TWO(out, 5);
+ vec = __msa_fill_h(val);
+
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+ dst += (4 * dst_stride);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+}
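Both idct8x8 variants above finish with SRARI_H4_SH(..., 5) followed by VP9_ADDBLK_ST8x4_UB. A rough scalar sketch of that reconstruction step is given below, assuming clip_pixel and ROUND_POWER_OF_TWO from vpx_dsp/vpx_dsp_common.h; `res` is a hypothetical array holding the 8x8 transform output.

#include <stdint.h>

#include "vpx_dsp/vpx_dsp_common.h"

/* Illustrative sketch of the final rounding and add-block step: each
 * residual lane is rounded with ROUND_POWER_OF_TWO(x, 5) and added to the
 * destination pixel with saturation to [0, 255]. */
static void round_add_block_sketch(const int16_t res[8][8], uint8_t *dst,
                                   int stride) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = ROUND_POWER_OF_TWO(res[r][c], 5);
      dst[r * stride + c] = clip_pixel(dst[r * stride + c] + v);
    }
  }
}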
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
new file mode 100644
index 0000000000..835e10e125
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+ "lb %[tmp9], 8(%[left]) \n\t"
+ "lb %[tmp10], 9(%[left]) \n\t"
+ "lb %[tmp11], 10(%[left]) \n\t"
+ "lb %[tmp12], 11(%[left]) \n\t"
+ "lb %[tmp13], 12(%[left]) \n\t"
+ "lb %[tmp14], 13(%[left]) \n\t"
+ "lb %[tmp15], 14(%[left]) \n\t"
+ "lb %[tmp16], 15(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+ "replv.qb %[tmp9], %[tmp9] \n\t"
+ "replv.qb %[tmp10], %[tmp10] \n\t"
+ "replv.qb %[tmp11], %[tmp11] \n\t"
+ "replv.qb %[tmp12], %[tmp12] \n\t"
+ "replv.qb %[tmp13], %[tmp13] \n\t"
+ "replv.qb %[tmp14], %[tmp14] \n\t"
+ "replv.qb %[tmp15], %[tmp15] \n\t"
+ "replv.qb %[tmp16], %[tmp16] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "sw %[tmp1], 8(%[dst]) \n\t"
+ "sw %[tmp1], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "sw %[tmp2], 8(%[dst]) \n\t"
+ "sw %[tmp2], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "sw %[tmp3], 8(%[dst]) \n\t"
+ "sw %[tmp3], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "sw %[tmp4], 8(%[dst]) \n\t"
+ "sw %[tmp4], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "sw %[tmp5], 8(%[dst]) \n\t"
+ "sw %[tmp5], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "sw %[tmp6], 8(%[dst]) \n\t"
+ "sw %[tmp6], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "sw %[tmp7], 8(%[dst]) \n\t"
+ "sw %[tmp7], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+ "sw %[tmp8], 8(%[dst]) \n\t"
+ "sw %[tmp8], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp9], (%[dst]) \n\t"
+ "sw %[tmp9], 4(%[dst]) \n\t"
+ "sw %[tmp9], 8(%[dst]) \n\t"
+ "sw %[tmp9], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp10], (%[dst]) \n\t"
+ "sw %[tmp10], 4(%[dst]) \n\t"
+ "sw %[tmp10], 8(%[dst]) \n\t"
+ "sw %[tmp10], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp11], (%[dst]) \n\t"
+ "sw %[tmp11], 4(%[dst]) \n\t"
+ "sw %[tmp11], 8(%[dst]) \n\t"
+ "sw %[tmp11], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp12], (%[dst]) \n\t"
+ "sw %[tmp12], 4(%[dst]) \n\t"
+ "sw %[tmp12], 8(%[dst]) \n\t"
+ "sw %[tmp12], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp13], (%[dst]) \n\t"
+ "sw %[tmp13], 4(%[dst]) \n\t"
+ "sw %[tmp13], 8(%[dst]) \n\t"
+ "sw %[tmp13], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp14], (%[dst]) \n\t"
+ "sw %[tmp14], 4(%[dst]) \n\t"
+ "sw %[tmp14], 8(%[dst]) \n\t"
+ "sw %[tmp14], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp15], (%[dst]) \n\t"
+ "sw %[tmp15], 4(%[dst]) \n\t"
+ "sw %[tmp15], 8(%[dst]) \n\t"
+ "sw %[tmp15], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp16], (%[dst]) \n\t"
+ "sw %[tmp16], 4(%[dst]) \n\t"
+ "sw %[tmp16], 8(%[dst]) \n\t"
+ "sw %[tmp16], 12(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
+ [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
+ [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
+ [tmp16] "=&r"(tmp16)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "lw %[above1], 8(%[above]) \n\t"
+ "lw %[above2], 12(%[above]) \n\t"
+ "lw %[left1], 8(%[left]) \n\t"
+ "lw %[left2], 12(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addiu %[average], %[average], 16 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 5 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
+ [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
+ [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
+ [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+#endif // #if HAVE_DSPR2
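The long inline-assembly body of vpx_dc_predictor_16x16_dspr2 above reduces to a short scalar computation: sum the 16 above and 16 left neighbours, round with (sum + 16) >> 5, and fill the block with that value. A minimal scalar sketch, assuming standard <string.h> memset:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of the 16x16 DC predictor: rounded average of the above
 * and left neighbours replicated across the block. */
static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  int i, sum = 0;
  uint8_t expected_dc;
  for (i = 0; i < 16; ++i) sum += above[i] + left[i];
  expected_dc = (uint8_t)((sum + 16) >> 5);
  for (i = 0; i < 16; ++i) memset(dst + i * stride, expected_dc, 16);
}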
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
new file mode 100644
index 0000000000..dce03a2b2a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4;
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "sw %[tmp1], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+ __asm__ __volatile__(
+ "lw %[above_c], (%[above]) \n\t"
+ "lw %[left_c], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l], %[above_c] \n\t"
+ "preceu.ph.qbr %[above_r], %[above_c] \n\t"
+ "preceu.ph.qbl %[left_l], %[left_c] \n\t"
+ "preceu.ph.qbr %[left_r], %[left_c] \n\t"
+
+ "addu.ph %[average], %[above_r], %[above_l] \n\t"
+ "addu.ph %[average], %[average], %[left_l] \n\t"
+ "addu.ph %[average], %[average], %[left_r] \n\t"
+ "addiu %[average], %[average], 4 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 3 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+
+ : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
+ [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
+ [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+
+void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t left0, left1, left2, left3;
+ int32_t res0, res1;
+ int32_t resl;
+ int32_t resr;
+ int32_t top_left;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ __asm__ __volatile__(
+ "ulw %[resl], (%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+ "lbu %[left1], 1(%[left]) \n\t"
+ "lbu %[left2], 2(%[left]) \n\t"
+ "lbu %[left3], 3(%[left]) \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[resl] \n\t"
+ "preceu.ph.qbr %[abover], %[resl] \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "replv.ph %[left1], %[left1] \n\t"
+ "replv.ph %[left2], %[left2] \n\t"
+ "replv.ph %[left3], %[left3] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left0] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left0] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left1] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left1] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left2] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left2] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left3] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left3] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
+ [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
+ [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
+ [resr] "=&r"(resr), [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
+}
+#endif // #if HAVE_DSPR2
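vpx_tm_predictor_4x4_dspr2 above implements the TrueMotion predictor: each output pixel is left[row] + above[col] - top_left, where top_left is above[-1], clamped to [0, 255] via the vpx_ff_cropTbl lookup. A minimal scalar sketch, assuming clip_pixel from vpx_dsp/vpx_dsp_common.h:

#include <stddef.h>
#include <stdint.h>

#include "vpx_dsp/vpx_dsp_common.h"

/* Scalar sketch of the TM (TrueMotion) 4x4 predictor; top_left is the
 * pixel immediately above and to the left of the block, i.e. above[-1]. */
static void tm_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  int r, c;
  const int top_left = above[-1];
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dst[r * stride + c] = clip_pixel(left[r] + above[c] - top_left);
}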
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
new file mode 100644
index 0000000000..16e7fc5507
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "preceu.ph.qbl %[above_l2], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r2], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l2], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r2], %[left2] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l2] \n\t"
+ "addu.ph %[average], %[average], %[above_r2] \n\t"
+ "addu.ph %[average], %[average], %[left_l2] \n\t"
+ "addu.ph %[average], %[average], %[left_r2] \n\t"
+
+ "addiu %[average], %[average], 8 \n\t"
+
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 4 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
+ [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
+ [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
+ [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
+ [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
+ [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+
+void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t abovel_1, abover_1;
+ int32_t left0;
+ int32_t res0, res1, res2, res3;
+ int32_t reshw;
+ int32_t top_left;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ __asm__ __volatile__(
+ "ulw %[reshw], (%[above]) \n\t"
+ "ulw %[top_left], 4(%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[reshw] \n\t"
+ "preceu.ph.qbr %[abover], %[reshw] \n\t"
+ "preceu.ph.qbl %[abovel_1], %[top_left] \n\t"
+ "preceu.ph.qbr %[abover_1], %[top_left] \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+ "replv.ph %[left0], %[left0] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 1(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 2(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 3(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 4(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 5(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 6(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 7(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
+ [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
+ [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
+ [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
+ [top_left] "=&r"(top_left)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride), [cm] "r"(cm));
+}
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c
new file mode 100644
index 0000000000..b5ee943031
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
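+/* Saturating (clamp-at-zero) subtract of in0/in1 from out0/out1; used by
+ * the TM predictors below to remove the top-left pixel. */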
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
+ { \
+ out0 = __msa_subs_u_h(out0, in0); \
+ out1 = __msa_subs_u_h(out1, in1); \
+ }
+
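+/* Vertical prediction: the row of pixels directly above the block is
+ * copied into every row of the destination. */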
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t src_data;
+
+ src_data = LW(src);
+
+ SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint32_t src_data1, src_data2;
+
+ src_data1 = LW(src);
+ src_data2 = LW(src + 4);
+
+ for (row = 8; row--;) {
+ SW(src_data1, dst);
+ SW(src_data2, (dst + 4));
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src0;
+
+ src0 = LD_UB(src);
+
+ for (row = 16; row--;) {
+ ST_UB(src0, dst);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src1, src2;
+
+ src1 = LD_UB(src);
+ src2 = LD_UB(src + 16);
+
+ for (row = 32; row--;) {
+ ST_UB2(src1, src2, dst, 16);
+ dst += dst_stride;
+ }
+}
+
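+/* Horizontal prediction: every destination row is filled with the
+ * corresponding pixel from the left column.  For the small blocks the
+ * multiply by 0x01...01 replicates that byte across a 32/64-bit word so a
+ * whole row is stored at once; the larger blocks use __msa_fill_b. */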
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t out0, out1, out2, out3;
+
+ out0 = src[0] * 0x01010101;
+ out1 = src[1] * 0x01010101;
+ out2 = src[2] * 0x01010101;
+ out3 = src[3] * 0x01010101;
+
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ out0 = src[0] * 0x0101010101010101ull;
+ out1 = src[1] * 0x0101010101010101ull;
+ out2 = src[2] * 0x0101010101010101ull;
+ out3 = src[3] * 0x0101010101010101ull;
+ out4 = src[4] * 0x0101010101010101ull;
+ out5 = src[5] * 0x0101010101010101ull;
+ out6 = src[6] * 0x0101010101010101ull;
+ out7 = src[7] * 0x0101010101010101ull;
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 4; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 8; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB2(src0, src0, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src1, src1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src2, src2, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src3, src3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
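+/* DC prediction: the block is filled with the rounded average of the above
+ * and left border pixels (horizontal-add reductions followed by srari with
+ * log2 of the number of samples).  The _tl variants average a single
+ * border, and the _128dc variants use the fixed value 128 when no border is
+ * available. */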
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0, val1;
+ v16i8 store, src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LW(src_top);
+ val1 = LW(src_left);
+ INSERT_W2_SB(val0, val1, src);
+ sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0;
+ v16i8 store, data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+
+ val0 = LW(src);
+ data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+ sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0, val1;
+ v16i8 store;
+ v16u8 src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src_top);
+ val1 = LD(src_left);
+ INSERT_D2_UB(val0, val1, src);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0;
+ v16i8 store;
+ v16u8 data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src);
+ data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+ uint64_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(out, out, out, out, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 top, left, out;
+ v8u16 sum_h, sum_top, sum_left;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ top = LD_UB(src_top);
+ left = LD_UB(src_left);
+ HADD_UB2_UH(top, left, sum_top, sum_left);
+ sum_h = sum_top + sum_left;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 data, out;
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ data = LD_UB(src);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 top0, top1, left0, left1, out;
+ v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src_top, 16, top0, top1);
+ LD_UB2(src_left, 16, left0, left1);
+ HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+ HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+ sum_h = sum_top0 + sum_top1;
+ sum_h += sum_left0 + sum_left1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 data0, data1, out;
+ v8u16 sum_h, sum_data0, sum_data1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src, 16, data0, data1);
+ HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+ sum_h = sum_data0 + sum_data1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t row;
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
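+/* True-motion (TM) prediction.  A scalar sketch of what the vector code
+ * below computes, with bs the block size and clip_pixel() clamping to
+ * [0, 255]:
+ *
+ *   for (r = 0; r < bs; ++r)
+ *     for (c = 0; c < bs; ++c)
+ *       dst[r * dst_stride + c] = clip_pixel(left[r] + above[c] - above[-1]);
+ *
+ * The MSA code forms left + above with HADD, subtracts the top-left pixel
+ * with a saturating subtract (IPRED_SUBS_UH2_UH, clamping at 0) and then
+ * saturates to 8 bits with SAT_UH(..., 7). */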
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val;
+ uint8_t top_left = src_top_ptr[-1];
+ v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+ v16u8 src0, src1, src2, src3;
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+ val = LW(src_top_ptr);
+ src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
+
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val;
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+ v16u8 src0, src1, src2, src3;
+
+ val = LD(src_top_ptr);
+ src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r, res_l;
+
+ src_top = LD_SB(src_top_ptr);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint8_t top_left = src_top[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+ LD_SB2(src_top, 16, src_top0, src_top1);
+ src_top_left = (v8u16)__msa_fill_h(top_left);
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left1 = __msa_fill_b(src_left[1]);
+ src_left2 = __msa_fill_b(src_left[2]);
+ src_left3 = __msa_fill_b(src_left[3]);
+ src_left += 4;
+
+ ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+ }
+}
+
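+/* Entry points referenced from the run-time CPU dispatch (vpx_dsp_rtcd);
+ * each forwards to the matching static helper above and explicitly voids
+ * the border pointer its mode does not use. */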
+void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_32x32_msa(dst, y_stride);
+}
+
+void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_4x4_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_8x8_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_16x16_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_tm_32x32_msa(above, left, dst, y_stride);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
new file mode 100644
index 0000000000..cbea22f20f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                     \
+  ({                                                                       \
+    int32_t tmp, out;                                                      \
+    int dct_cost_rounding = DCT_CONST_ROUNDING;                            \
+    int in = input;                                                        \
+                                                                           \
+    __asm__ __volatile__(                                                  \
+        /* out = dct_const_round_shift(dc * cospi_16_64); */               \
+        "mtlo %[dct_cost_rounding], $ac1              \n\t"                \
+        "mthi $zero,                $ac1              \n\t"                \
+        "madd $ac1, %[in],  %[cospi_16_64]            \n\t"                \
+        "extp %[tmp], $ac1, 31                        \n\t"                \
+                                                                           \
+        /* out = dct_const_round_shift(out * cospi_16_64); */              \
+        "mtlo %[dct_cost_rounding], $ac2              \n\t"                \
+        "mthi $zero,                $ac2              \n\t"                \
+        "madd $ac2, %[tmp], %[cospi_16_64]            \n\t"                \
+        "extp %[out], $ac2, 31                        \n\t"                \
+                                                                           \
+        : [tmp] "=&r"(tmp), [out] "=r"(out)                                \
+        : [in] "r"(in), [dct_cost_rounding] "r"(dct_cost_rounding),        \
+          [cospi_16_64] "r"(cospi_16_64));                                 \
+    out;                                                                   \
+  })
+
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output);
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
+void iadst4_dspr2(const int16_t *input, int16_t *output);
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
+void iadst8_dspr2(const int16_t *input, int16_t *output);
+void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
+void iadst16_dspr2(const int16_t *input, int16_t *output);
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h
new file mode 100644
index 0000000000..3b66249ef2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
+
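+/* Pack two 16-bit cosine constants into alternating lanes of a v8i16 so
+ * that one dotp_s_w of interleaved inputs yields a * c0_h + b * c1_h per
+ * 32-bit lane. */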
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h) \
+ ({ \
+ v8i16 out0_m, r0_m, r1_m; \
+ \
+ r0_m = __msa_fill_h(c0_h); \
+ r1_m = __msa_fill_h(c1_h); \
+ out0_m = __msa_ilvev_h(r1_m, r0_m); \
+ \
+ out0_m; \
+ })
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
+ { \
+ uint8_t *dst_m = (uint8_t *)(dst); \
+ v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
+ ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
+ res0_m, res1_m, res2_m, res3_m); \
+ ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
+ res2_m, res3_m); \
+ CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
+ PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
+ }
+
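+/* 4x4 IDCT: even part from (in0, in2) and odd part from (in1, in3) via
+ * cospi dot products, rounded by DCT_CONST_BITS, then recombined with a
+ * final butterfly. */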
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 c0_m, c1_m, c2_m, c3_m; \
+ v8i16 step0_m, step1_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ step0_m = __msa_ilvr_h(in2, in0); \
+ DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ \
+ c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ step1_m = __msa_ilvr_h(in3, in1); \
+ DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ \
+ PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
+ SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
+ BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
+ out0, out1, out2, out3); \
+ }
+
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 res0_m, res1_m, c0_m, c1_m; \
+ v8i16 k1_m, k2_m, k3_m, k4_m; \
+ v8i16 zero_m = { 0 }; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 int0_m, int1_m, int2_m, int3_m; \
+ v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
+ -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
+ \
+ SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
+ ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
+ int0_m = tmp2_m + tmp1_m; \
+ \
+ SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
+ ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int1_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
+ ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
+ DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
+ int2_m = tmp0_m + tmp1_m; \
+ \
+ c0_m = __msa_splati_h(mask_m, 6); \
+ c0_m = __msa_ilvev_h(c0_m, k1_m); \
+ \
+ res0_m = __msa_ilvr_h((in1), (in3)); \
+ tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
+ int3_m = tmp2_m + tmp0_m; \
+ \
+ res0_m = __msa_ilvr_h((in2), (in3)); \
+ c1_m = __msa_ilvev_h(k4_m, k3_m); \
+ \
+ tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
+ res1_m = __msa_ilvr_h((in0), (in2)); \
+ c1_m = __msa_ilvev_h(k1_m, zero_m); \
+ \
+ tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
+ int3_m += tmp2_m; \
+ int3_m += tmp3_m; \
+ \
+ SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
+ PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
+ }
+
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
+ ({ \
+ v8i16 c0_m, c1_m; \
+ \
+ SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
+ c0_m = __msa_ilvev_h(c1_m, c0_m); \
+ \
+ c0_m; \
+ })
+
+/* multiply and add macro */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
+ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
+ cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
+ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
+ cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
+ }
+
+/* idct 8x8 macro */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
+ cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
+ \
+ k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
+ k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
+ k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
+ k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
+ VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
+ SUB2(in1, in3, in7, in5, res0_m, res1_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
+ k1_m = __msa_splati_h(mask_m, 4); \
+ \
+ ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
+ DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ tp4_m = in1 + in3; \
+ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
+ tp7_m = in7 + in5; \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
+ BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
+ BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
+ out1, out2, out3, out4, out5, out6, out7); \
+ }
+
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
+ v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
+ v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
+ cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
+ v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
+ -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
+ v8i16 mask3_m = { \
+ -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
+ }; \
+ \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
+ k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
+ ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
+ ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
+ k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
+ ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
+ ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
+ r5_m, r6_m, r7_m); \
+ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
+ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
+ ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
+ BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
+ k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
+ k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
+ ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
+ r1_m, r2_m, r3_m); \
+ k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
+ r6_m, r7_m); \
+ ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
+ SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
+ m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
+ k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
+ k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
+ ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
+ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
+ m1_m, m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
+ ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
+ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
+ m2_m, m3_m); \
+ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
+ \
+ out1 = -in1; \
+ out3 = -in3; \
+ out5 = -in5; \
+ out7 = -in7; \
+ }
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
+ r12, r13, r14, r15, out0, out1, out2, out3, out4, \
+ out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
+ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
+ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
+ v8i16 h8_m, h9_m, h10_m, h11_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m; \
+ \
+ /* stage 1 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
+ MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
+ MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
+ MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
+ g11_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
+ k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
+ MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
+ g15_m); \
+ \
+ /* stage 2 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
+ k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
+ MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
+ h3_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
+ k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
+ MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
+ h6_m, h7_m); \
+ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
+ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
+ h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
+ \
+ /* stage 3 */ \
+ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
+ MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
+ out7); \
+ MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
+ out13, out15); \
+ \
+ /* stage 4 */ \
+ k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
+ k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
+ MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+ }
+
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride);
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+ int32_t dst_stride);
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
+#endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c
new file mode 100644
index 0000000000..44ba65c7ac
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c
@@ -0,0 +1,1230 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
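+/* Row pass of the 16-point IDCT.  Each MAC accumulator is preloaded with
+ * 8192 (1 << (DCT_CONST_BITS - 1)) via mtlo/mthi, so the madd/msub plus
+ * extp sequences implement dct_const_round_shift() of the cospi products
+ * (the accumulator extract position is assumed to have been set up
+ * beforehand with wrdsp). */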
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+
+ for (i = no_rows; i--;) {
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 16));
+
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
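+    /* Recombine the odd-half terms (step2_8..step2_15) and rotate by
+       cospi_16_64 to obtain step1_10..step1_13. */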
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
+
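+    /* Final butterflies and stores.  Outputs are written with a stride of
+       16 int16 (32 bytes) while the output pointer advances by one per
+       iteration, so the row pass leaves its result transposed for the
+       column pass. */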
+ __asm__ __volatile__(
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
+ [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
+ [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
+ [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
+ [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
+
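+    /* Remaining eight outputs (transposed rows 2..5 and 10..13). */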
+ __asm__ __volatile__(
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6)
+ : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
+
+ input += 16;
+ output += 1;
+ }
+}
+
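+/* Column pass: applies the 16-point butterfly to each column of the
+   transposed row-pass output and adds the clipped result to dest.
+   vpx_ff_cropTbl is the byte-clamping table used via lbux below. */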
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ /* prefetch vpx_ff_cropTbl */
+ prefetch_load(vpx_ff_cropTbl);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 128);
+ prefetch_load(vpx_ff_cropTbl + 160);
+ prefetch_load(vpx_ff_cropTbl + 192);
+ prefetch_load(vpx_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
+ [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
+ [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
+ [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
+ [step1_3] "=r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
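+    /* step2_8, step2_9, step2_14 and step2_15 from coefficients 1, 15, 9
+       and 7 (byte offsets 2, 30, 18 and 14). */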
+ __asm__ __volatile__(
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
+ [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
+ [step2_14] "=r"(step2_14)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
+ [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
+ [step2_13] "=r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [result3] "=&r"(result3),
+ [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
+ [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
+ [step1_7] "=r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
+ [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
+ [step1_13] "=r"(step1_13)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
+ [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
+ [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
+ [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
+ [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
+
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
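+    /* Reconstruction: for each of the 16 rows in this column, compute
+       (v + 32) >> 6, add it to the destination byte, clamp through cm
+       (vpx_ff_cropTbl points into the middle of a wider table, so
+       out-of-range sums clamp to 0 or 255), store and advance by stride. */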
+ __asm__ __volatile__(
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
+ [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
+ :
+ [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
+ [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
+ [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
+ [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
+ [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
+
+ input += 16;
+ }
+}
+
+void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+  /* Set the bit position (pos = 45) used by extp when extracting rounded
+     results from the accumulators. */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
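+  /* A plain-C sketch of the rounding that each madd/extp pair in this file
+     performs; it mirrors dct_const_round_shift() from vpx_dsp/txfm_common.h
+     (DCT_CONST_BITS == 14).  dct_round is only an illustrative name:
+
+       static inline int dct_round(int64_t x) {
+         return (int)((x + (1 << 13)) >> 14);
+       }
+  */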
+ // First transform rows
+ idct16_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_cols_add_blk_dspr2(out, dest, stride);
+}
+
+void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+  // First transform rows. Since all non-zero DCT coefficients are in the
+  // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+ idct16_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
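+  /* Zero the untouched columns of the transposed buffer: each sw clears two
+     int16 coefficients, each iteration clears two columns across all 16
+     rows, so six iterations clear columns 4..15. */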
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_cols_add_blk_dspr2(out, dest, stride);
+}
+
+void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
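+  /* a1 is the constant DC residual added to every pixel.  The three paths
+     below handle negative, larger-than-255 and in-range values with
+     4-pixel-wide saturating byte operations. */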
+ if (a1 < 0) {
+    /* Use quad-byte operations; input and output memory are four-byte
+     * aligned.  a1 is negative, so replicate |a1| and subtract with
+     * saturation. */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+    /* Use quad-byte operations; input and output memory are four-byte
+     * aligned.  replv.qb replicates an 8-bit value, so a1 > 255 is split
+     * into two halves that are applied with two saturating adds. */
+ a11 = a1 >> 1;
+ a12 = a1 - a11;
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
+ }
+ } else {
+    /* Use quad-byte operations; input and output memory are four-byte
+     * aligned. */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+
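+/* 16-point inverse ADST.  Despite living in the DSPr2 file, this routine is
+   plain portable C. */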
+void iadst16_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = output[8] = output[9] = output[10] =
+ output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
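+  // Final reordering and sign flips of the ADST outputs.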
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+#endif // HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
new file mode 100644
index 0000000000..3f043b48ba
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -0,0 +1,1119 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int step1_28, step1_29, step1_30, step1_31;
+ int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int step2_28, step2_29, step2_30, step2_31;
+ int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = vpx_ff_cropTbl;
+
+ /* prefetch vpx_ff_cropTbl */
+ prefetch_load(vpx_ff_cropTbl);
+ prefetch_load(vpx_ff_cropTbl + 32);
+ prefetch_load(vpx_ff_cropTbl + 64);
+ prefetch_load(vpx_ff_cropTbl + 96);
+ prefetch_load(vpx_ff_cropTbl + 128);
+ prefetch_load(vpx_ff_cropTbl + 160);
+ prefetch_load(vpx_ff_cropTbl + 192);
+ prefetch_load(vpx_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * stride;
+
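+    /* dest_pix walks this column top-down; dest_pix1 starts at row 31 so the
+       symmetric bottom half of the outputs can be written bottom-up.  The
+       odd coefficients are handled in quadruples (1/31/17/15, 9/23/25/7,
+       5/27/21/11, 13/19/29/3), each producing four step1 terms. */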
+ __asm__ __volatile__(
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
+ [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
+ [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
+ [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
+ [step2_15] "=&r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
+ [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
+ [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
+ [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
+ [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
+ [step3_15] "=&r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
+
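+    /* Rotate the step1_16..step1_31 pairs by cospi_8_64/cospi_24_64 to form
+       the corresponding step3 terms. */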
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_17], %[step1_18] \n\t"
+ "sub %[temp1], %[step1_30], %[step1_29] \n\t"
+ "add %[step3_17], %[step1_17], %[step1_18] \n\t"
+ "add %[step3_30], %[step1_30], %[step1_29] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_29], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
+ [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
+ [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
+ [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_16], %[step1_19] \n\t"
+ "sub %[temp1], %[step1_31], %[step1_28] \n\t"
+ "add %[step3_16], %[step1_16], %[step1_19] \n\t"
+ "add %[step3_31], %[step1_31], %[step1_28] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_28], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
+ [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
+ [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
+ [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_23], %[step1_20] \n\t"
+ "sub %[temp1], %[step1_24], %[step1_27] \n\t"
+ "add %[step3_23], %[step1_23], %[step1_20] \n\t"
+ "add %[step3_24], %[step1_24], %[step1_27] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_27], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
+ [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
+ [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
+ [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_22], %[step1_21] \n\t"
+ "sub %[temp1], %[step1_25], %[step1_26] \n\t"
+ "add %[step3_22], %[step1_22], %[step1_21] \n\t"
+ "add %[step3_25], %[step1_25], %[step1_26] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_26], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
+ [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
+ [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
+ [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "add %[step2_16], %[step3_16], %[step3_23] \n\t"
+ "add %[step2_17], %[step3_17], %[step3_22] \n\t"
+ "add %[step2_18], %[step3_18], %[step3_21] \n\t"
+ "add %[step2_19], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
+ "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
+ "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
+
+ : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
+ [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
+ [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
+ [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
+ : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
+ [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
+ [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
+
+ __asm__ __volatile__(
+ "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
+ "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
+ "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
+ "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_28], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_29], %[step3_29], %[step3_26] \n\t"
+ "add %[step2_30], %[step3_30], %[step3_25] \n\t"
+ "add %[step2_31], %[step3_31], %[step3_24] \n\t"
+
+ : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
+ [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
+ [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
+ [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
+ : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
+ [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
+ [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
+ [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
+
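+    /* Even half: coefficients 0, 16, 8 and 24 (byte offsets 0, 32, 16 and
+       48) produce step1_0..step1_3. */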
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
+ [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
+ [step1_3] "=&r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
+ [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
+ [step1_7] "=&r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "add %[step2_0], %[step1_0], %[step1_7] \n\t"
+ "add %[step2_1], %[step1_1], %[step1_6] \n\t"
+ "add %[step2_2], %[step1_2], %[step1_5] \n\t"
+ "add %[step2_3], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
+ "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
+ "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
+
+ : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
+ [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
+ [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
+ [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
+ : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
+ [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
+ [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
+ [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
+
+ // stage 7
+ __asm__ __volatile__(
+ "add %[step1_0], %[step2_0], %[step3_15] \n\t"
+ "add %[step1_1], %[step2_1], %[step3_14] \n\t"
+ "add %[step1_2], %[step2_2], %[step3_13] \n\t"
+ "add %[step1_3], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
+ "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
+ "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
+ [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
+ [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
+ [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
+ : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
+ [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
+ [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
+ [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
+
+ __asm__ __volatile__(
+ "add %[step1_4], %[step2_4], %[step3_11] \n\t"
+ "add %[step1_5], %[step2_5], %[step3_10] \n\t"
+ "add %[step1_6], %[step2_6], %[step3_9] \n\t"
+ "add %[step1_7], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
+ "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
+ "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
+
+ : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
+ [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
+ [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
+ [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
+ : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
+ [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
+ [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
+ [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "add %[temp1], %[step2_27], %[step2_20] \n\t"
+ "sub %[temp2], %[step2_26], %[step2_21] \n\t"
+ "add %[temp3], %[step2_26], %[step2_21] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
+
+ "extp %[step1_20], $ac0, 31 \n\t"
+ "extp %[step1_27], $ac1, 31 \n\t"
+ "extp %[step1_21], $ac2, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
+ [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "add %[temp1], %[step2_25], %[step2_22] \n\t"
+ "sub %[temp2], %[step2_24], %[step2_23] \n\t"
+ "add %[temp3], %[step2_24], %[step2_23] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
+
+ "extp %[step1_22], $ac0, 31 \n\t"
+ "extp %[step1_25], $ac1, 31 \n\t"
+ "extp %[step1_23], $ac2, 31 \n\t"
+ "extp %[step1_24], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
+ [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
+ [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
+ [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
+ [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
+ [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
+ [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
+ [step2_31] "r"(step2_31));
+
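+ /* Rounding (add 32, arithmetic shift right by 6) and clamping for the
+ * rows handled above is done inside the asm: lbux indexes the clip
+ * table cm (presumably vpx_ff_cropTbl) with pixel + residual to
+ * saturate to [0, 255]. The mirrored rows at the bottom of the column
+ * are rounded here in C and stored through dest_pix1, which steps back
+ * up the destination by stride. */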
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
+ [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
+ [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
+ [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
+ [step1_27] "r"(step1_27));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
+ [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
+ [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
+ [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
+ [step1_23] "r"(step1_23));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
+ [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
+ [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
+ [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
+ [step2_19] "r"(step2_19));
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__(
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c
new file mode 100644
index 0000000000..3c0468c00f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c
@@ -0,0 +1,1218 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int step1_28, step1_29, step1_30, step1_31;
+ int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int step2_28, step2_29, step2_30, step2_31;
+ int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
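+
+ /* Every cospi multiply below uses the same DSPr2 fixed-point pattern.
+ * A scalar sketch, assuming DCT_CONST_BITS == 14 from
+ * vpx_dsp/txfm_common.h:
+ *
+ * int64_t acc = 8192; // mtlo const_2_power_13 / mthi $zero
+ * acc += (int64_t)a * c0; // madd
+ * acc -= (int64_t)b * c1; // msub
+ * out = (int)(acc >> 14); // extp ..., 31 (with wrdsp pos == 45)
+ *
+ * i.e. dct_const_round_shift(a * c0 - b * c1) in the C reference code. */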
+
+ for (i = no_rows; i--;) {
+ input_int = (const int32_t *)input;
+
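+ /* Check the 32 coefficients of this row two at a time via 32-bit loads:
+ * an all-zero row contributes only a zero column to the (transposed)
+ * output, so write the zeros directly and skip the transform. */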
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__(
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r"(output));
+
+ output += 1;
+
+ continue;
+ }
+
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 32));
+ prefetch_load((const uint8_t *)(input + 48));
+
+ __asm__ __volatile__(
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
+ [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
+ [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
+ [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
+ [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
+ [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
+ [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
+ [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
+ [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
+ [step2_15] "=&r"(step2_15)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
+ [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
+ [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
+ [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
+ [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
+ [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
+ [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
+ [step3_15] "=&r"(step3_15)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
+ [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
+ [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
+ [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
+ [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_17], %[step1_18] \n\t"
+ "sub %[temp1], %[step1_30], %[step1_29] \n\t"
+ "add %[step3_17], %[step1_17], %[step1_18] \n\t"
+ "add %[step3_30], %[step1_30], %[step1_29] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_29], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
+ [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
+ [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
+ [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_16], %[step1_19] \n\t"
+ "sub %[temp1], %[step1_31], %[step1_28] \n\t"
+ "add %[step3_16], %[step1_16], %[step1_19] \n\t"
+ "add %[step3_31], %[step1_31], %[step1_28] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_28], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
+ [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
+ [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
+ [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_23], %[step1_20] \n\t"
+ "sub %[temp1], %[step1_24], %[step1_27] \n\t"
+ "add %[step3_23], %[step1_23], %[step1_20] \n\t"
+ "add %[step3_24], %[step1_24], %[step1_27] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_27], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
+ [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
+ [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
+ [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_22], %[step1_21] \n\t"
+ "sub %[temp1], %[step1_25], %[step1_26] \n\t"
+ "add %[step3_22], %[step1_22], %[step1_21] \n\t"
+ "add %[step3_25], %[step1_25], %[step1_26] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_26], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
+ [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
+ [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
+ [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
+ [cospi_8_64] "r"(cospi_8_64));
+
+ __asm__ __volatile__(
+ "add %[step2_16], %[step3_16], %[step3_23] \n\t"
+ "add %[step2_17], %[step3_17], %[step3_22] \n\t"
+ "add %[step2_18], %[step3_18], %[step3_21] \n\t"
+ "add %[step2_19], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
+ "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
+ "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
+
+ : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
+ [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
+ [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
+ [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
+ : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
+ [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
+ [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
+
+ __asm__ __volatile__(
+ "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
+ "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
+ "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
+ "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_28], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_29], %[step3_29], %[step3_26] \n\t"
+ "add %[step2_30], %[step3_30], %[step3_25] \n\t"
+ "add %[step2_31], %[step3_31], %[step3_24] \n\t"
+
+ : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
+ [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
+ [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
+ [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
+ : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
+ [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
+ [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
+ [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
+
+ __asm__ __volatile__(
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [result1] "=&r"(result1),
+ [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
+ [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
+ [step1_3] "=&r"(step1_3)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
+ [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
+ [step1_7] "=&r"(step1_7)
+ : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "add %[step2_0], %[step1_0], %[step1_7] \n\t"
+ "add %[step2_1], %[step1_1], %[step1_6] \n\t"
+ "add %[step2_2], %[step1_2], %[step1_5] \n\t"
+ "add %[step2_3], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
+ "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
+ "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
+
+ : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
+ [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
+ [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
+ [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
+ : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
+ [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
+ [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
+ [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
+
+ // stage 7
+ __asm__ __volatile__(
+ "add %[step1_0], %[step2_0], %[step3_15] \n\t"
+ "add %[step1_1], %[step2_1], %[step3_14] \n\t"
+ "add %[step1_2], %[step2_2], %[step3_13] \n\t"
+ "add %[step1_3], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
+ "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
+ "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
+ [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
+ [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
+ [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
+ : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
+ [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
+ [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
+ [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
+
+ __asm__ __volatile__(
+ "add %[step1_4], %[step2_4], %[step3_11] \n\t"
+ "add %[step1_5], %[step2_5], %[step3_10] \n\t"
+ "add %[step1_6], %[step2_6], %[step3_9] \n\t"
+ "add %[step1_7], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
+ "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
+ "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
+
+ : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
+ [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
+ [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
+ [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
+ : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
+ [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
+ [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
+ [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "add %[temp1], %[step2_27], %[step2_20] \n\t"
+ "sub %[temp2], %[step2_26], %[step2_21] \n\t"
+ "add %[temp3], %[step2_26], %[step2_21] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
+
+ "extp %[step1_20], $ac0, 31 \n\t"
+ "extp %[step1_27], $ac1, 31 \n\t"
+ "extp %[step1_21], $ac2, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
+ [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
+
+ __asm__ __volatile__(
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "add %[temp1], %[step2_25], %[step2_22] \n\t"
+ "sub %[temp2], %[step2_24], %[step2_23] \n\t"
+ "add %[temp3], %[step2_24], %[step2_23] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
+
+ "extp %[step1_22], $ac0, 31 \n\t"
+ "extp %[step1_25], $ac1, 31 \n\t"
+ "extp %[step1_23], $ac2, 31 \n\t"
+ "extp %[step1_24], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
+ [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
+ [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
+ [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
+
+ // final stage
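+ /* The results are stored transposed: consecutive outputs of this input
+ * row are written 64 bytes (one 32-entry row of out[]) apart, and
+ * output advances by one int16_t per input row, so each transformed row
+ * becomes a column of out[] for the later column pass. */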
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "add %[temp2], %[step1_2], %[step2_29] \n\t"
+ "add %[temp3], %[step1_3], %[step2_28] \n\t"
+ "sub %[load1], %[step1_3], %[step2_28] \n\t"
+ "sub %[load2], %[step1_2], %[step2_29] \n\t"
+ "sub %[load3], %[step1_1], %[step2_30] \n\t"
+ "sub %[load4], %[step1_0], %[step2_31] \n\t"
+ "sh %[temp0], 0(%[output]) \n\t"
+ "sh %[temp1], 64(%[output]) \n\t"
+ "sh %[temp2], 128(%[output]) \n\t"
+ "sh %[temp3], 192(%[output]) \n\t"
+ "sh %[load1], 1792(%[output]) \n\t"
+ "sh %[load2], 1856(%[output]) \n\t"
+ "sh %[load3], 1920(%[output]) \n\t"
+ "sh %[load4], 1984(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31),
+ [step1_1] "r"(step1_1), [step2_30] "r"(step2_30),
+ [step1_2] "r"(step1_2), [step2_29] "r"(step2_29),
+ [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
+ [output] "r"(output));
+
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "add %[temp2], %[step1_6], %[step1_25] \n\t"
+ "add %[temp3], %[step1_7], %[step1_24] \n\t"
+ "sub %[load1], %[step1_7], %[step1_24] \n\t"
+ "sub %[load2], %[step1_6], %[step1_25] \n\t"
+ "sub %[load3], %[step1_5], %[step1_26] \n\t"
+ "sub %[load4], %[step1_4], %[step1_27] \n\t"
+ "sh %[temp0], 256(%[output]) \n\t"
+ "sh %[temp1], 320(%[output]) \n\t"
+ "sh %[temp2], 384(%[output]) \n\t"
+ "sh %[temp3], 448(%[output]) \n\t"
+ "sh %[load1], 1536(%[output]) \n\t"
+ "sh %[load2], 1600(%[output]) \n\t"
+ "sh %[load3], 1664(%[output]) \n\t"
+ "sh %[load4], 1728(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27),
+ [step1_5] "r"(step1_5), [step1_26] "r"(step1_26),
+ [step1_6] "r"(step1_6), [step1_25] "r"(step1_25),
+ [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
+ [output] "r"(output));
+
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "add %[temp2], %[step1_10], %[step1_21] \n\t"
+ "add %[temp3], %[step1_11], %[step1_20] \n\t"
+ "sub %[load1], %[step1_11], %[step1_20] \n\t"
+ "sub %[load2], %[step1_10], %[step1_21] \n\t"
+ "sub %[load3], %[step1_9], %[step1_22] \n\t"
+ "sub %[load4], %[step1_8], %[step1_23] \n\t"
+ "sh %[temp0], 512(%[output]) \n\t"
+ "sh %[temp1], 576(%[output]) \n\t"
+ "sh %[temp2], 640(%[output]) \n\t"
+ "sh %[temp3], 704(%[output]) \n\t"
+ "sh %[load1], 1280(%[output]) \n\t"
+ "sh %[load2], 1344(%[output]) \n\t"
+ "sh %[load3], 1408(%[output]) \n\t"
+ "sh %[load4], 1472(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23),
+ [step1_9] "r"(step1_9), [step1_22] "r"(step1_22),
+ [step1_10] "r"(step1_10), [step1_21] "r"(step1_21),
+ [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
+ [output] "r"(output));
+
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "add %[temp2], %[step1_14], %[step2_17] \n\t"
+ "add %[temp3], %[step1_15], %[step2_16] \n\t"
+ "sub %[load1], %[step1_15], %[step2_16] \n\t"
+ "sub %[load2], %[step1_14], %[step2_17] \n\t"
+ "sub %[load3], %[step1_13], %[step2_18] \n\t"
+ "sub %[load4], %[step1_12], %[step2_19] \n\t"
+ "sh %[temp0], 768(%[output]) \n\t"
+ "sh %[temp1], 832(%[output]) \n\t"
+ "sh %[temp2], 896(%[output]) \n\t"
+ "sh %[temp3], 960(%[output]) \n\t"
+ "sh %[load1], 1024(%[output]) \n\t"
+ "sh %[load2], 1088(%[output]) \n\t"
+ "sh %[load3], 1152(%[output]) \n\t"
+ "sh %[load4], 1216(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19),
+ [step1_13] "r"(step1_13), [step2_18] "r"(step2_18),
+ [step1_14] "r"(step1_14), [step2_17] "r"(step2_17),
+ [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
+ [output] "r"(output));
+
+ input += 32;
+ output += 1;
+ }
+}
+
+void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ idct32_rows_dspr2(input, outptr, 32);
+
+ // Columns
+ vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
+}
+
+void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ idct32_rows_dspr2(input, outptr, 8);
+
+ outptr += 8;
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ for (i = 0; i < 31; ++i) {
+ outptr += 32;
+
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+ }
+
+ // Columns
+ vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
+}
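Note: the two zeroing loops above clear scratch elements 8..31 of every 32-element row, which the eight computed row transforms never write. A plain-C sketch of the same effect:

#include <string.h>
#include <stdint.h>

static void zero_unused_columns_32(int16_t out[32 * 32]) {
  int i;
  for (i = 0; i < 32; ++i)
    memset(&out[i * 32 + 8], 0, 24 * sizeof(out[0])); /* elements 8..31 */
}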
+
+void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ a11 = a1 >> 1;
+ a12 = a1 - a11;
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=&r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
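For reference, the three saturating quad-byte branches in vpx_idct32x32_1_add_dspr2() all compute the same scalar result. A minimal C sketch (the round-shift mirrors DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 with cospi_16_64 = 11585 from vpx_dsp/txfm_common.h; clip() is a local stand-in for clip_pixel()):

#include <stdint.h>

static int32_t round_shift_cospi16(int32_t x) {
  return (x * 11585 + (1 << 13)) >> 14; /* dct_const_round_shift(x * cospi_16_64) */
}

static uint8_t clip(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int stride) {
  const int out = round_shift_cospi16(round_shift_cospi16(input[0]));
  const int a1 = (out + 32) >> 6; /* the "addi 32 / sra 6" pair above */
  int r, c;
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dest[c] = clip(dest[c] + a1);
    dest += stride;
  }
}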
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c
new file mode 100644
index 0000000000..e214b538d4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
+ int step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--;) {
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
+
+ input += 4;
+ output += 1;
+ }
+}
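Aside: one iteration of the loop above, rendered as scalar C against the commented pseudo-code (constants from vpx_dsp/txfm_common.h; the sh stores at byte offsets 0/8/16/24, with output advancing one element per row, amount to a transposed write with a 4-element stride):

#include <stdint.h>

static int16_t rshift14(int32_t x) { return (int16_t)((x + (1 << 13)) >> 14); }

static void idct4_row_sketch(const int16_t *input, int16_t *output) {
  const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  const int16_t step_0 = rshift14((input[0] + input[2]) * cospi_16_64);
  const int16_t step_1 = rshift14((input[0] - input[2]) * cospi_16_64);
  const int16_t step_2 = rshift14(input[1] * cospi_24_64 - input[3] * cospi_8_64);
  const int16_t step_3 = rshift14(input[1] * cospi_8_64 + input[3] * cospi_24_64);
  output[0] = step_0 + step_3;
  output[4] = step_1 + step_2;
  output[8] = step_1 - step_2;
  output[12] = step_0 - step_3;
}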
+
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int stride) {
+ int step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ const int const_255 = 255;
+ int i;
+ uint8_t *dest_pix;
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
+ [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
+ [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
+ [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
+ [stride] "r"(stride));
+
+ input += 4;
+ }
+}
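Note on the slt/movz sequences above: "slt t, x, 255" then "movz x, 255, t" saturates high (movz writes when its third operand is zero), and "slt t, 0, x" then "movz x, 0, t" floors at zero. Per pixel this is:

#include <stdint.h>

static uint8_t recon_pixel(uint8_t pred, int32_t residual) {
  int v = pred + ((residual + 8) >> 4); /* 4x4 final rounding: (x + 8) >> 4 */
  if (v > 255) v = 255;                 /* first slt/movz pair */
  if (v < 0) v = 0;                     /* second slt/movz pair */
  return (uint8_t)v;
}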
+
+void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ // Rows
+ vpx_idct4_rows_dspr2(input, outptr);
+
+ // Columns
+ vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
+}
+
+void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ a11 = a1 >> 3;
+ a12 = a1 - (a11 * 7);
+
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+          "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
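Aside: replv.qb replicates only the low 8 bits into each lane, so the a1 > 255 branch above splits the DC offset before the saturating adds. The identity it relies on, as a sketch (for the DC magnitudes this path can see, both parts fit in a byte):

static void split_dc_offset(int a1, int *a11, int *a12) {
  *a11 = a1 >> 3;
  *a12 = a1 - 7 * (*a11); /* a1 == 7 * a11 + a12, applied as 7 + 1 adds */
}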
+
+void iadst4_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ // 32-bit result is enough for the following multiplications.
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
+#endif // #if HAVE_DSPR2
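A quick worked check of the dynamic-range comment in iadst4_dspr2() (a sketch; sinpi values as defined in vpx_dsp/txfm_common.h):

#include <assert.h>
#include <stdint.h>

static void iadst4_range_sketch(void) {
  /* output[0] accumulates sinpi_1_9*x0 + sinpi_3_9*x1 + sinpi_4_9*x2 +
   * sinpi_2_9*x3 before the round-shift; bound it for 14-bit inputs. */
  const int64_t coeff_sum = 5283 + 13377 + 15212 + 9929;
  const int64_t worst = 8191 * coeff_sum;          /* about 3.6e8 */
  assert(worst < (1LL << 29));                     /* the 29b in the comment */
  assert(((worst + (1 << 13)) >> 14) < (1 << 15)); /* 15-bit output depth */
}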
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c
new file mode 100644
index 0000000000..d4d246965c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--;) {
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [const_2_power_13] "r"(const_2_power_13),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
+ [input] "r"(input));
+
+ input += 8;
+ output += 1;
+ }
+}
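For cross-checking the commented steps, one row of idct8_rows_dspr2() as scalar C (constants from vpx_dsp/txfm_common.h; as in the 4-point version, rows are written transposed, here with an 8-element stride):

#include <stdint.h>

static int16_t rs14(int32_t x) { return (int16_t)((x + (1 << 13)) >> 14); }

static void idct8_row_sketch(const int16_t *in, int16_t *out) {
  const int c4 = 16069, c8 = 15137, c12 = 13623, c16 = 11585;
  const int c20 = 9102, c24 = 6270, c28 = 3196;
  const int16_t s0 = rs14((in[0] + in[4]) * c16);
  const int16_t s1 = rs14((in[0] - in[4]) * c16);
  const int16_t s2 = rs14(in[2] * c24 - in[6] * c8);
  const int16_t s3 = rs14(in[2] * c8 + in[6] * c24);
  int16_t s4 = rs14(in[1] * c28 - in[7] * c4);
  int16_t s7 = rs14(in[1] * c4 + in[7] * c28);
  const int16_t s5 = rs14(in[5] * c12 - in[3] * c20);
  const int16_t s6 = rs14(in[5] * c20 + in[3] * c12);
  const int16_t t0 = s0 + s3, t1 = s1 + s2, t2 = s1 - s2, t3 = s0 - s3;
  const int16_t u5 = rs14((s7 - s6 - s4 + s5) * c16);
  const int16_t u6 = rs14((s4 - s5 - s6 + s7) * c16);
  s4 = s4 + s5;
  s7 = s7 + s6;
  out[0] = t0 + s7;  out[8] = t1 + u6;  out[16] = t2 + u5;  out[24] = t3 + s4;
  out[32] = t3 - s4; out[40] = t2 - u5; out[48] = t1 - u6;  out[56] = t0 - s7;
}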
+
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int const_255 = 255;
+ uint8_t *dest_pix;
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__(
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
+ [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
+ [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
+ [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
+ [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
+ : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
+ [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
+ [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
+ [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
+ [stride] "r"(stride));
+
+ input += 8;
+ }
+}
+
+void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ // First transform rows
+ idct8_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
+}
+
+void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ // First transform rows
+ idct8_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
+ __asm__ __volatile__(
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r"(outptr));
+
+ // Then transform columns and add to dest
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
+}
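As in the 32x32 case, the block of sw stores above is a column-clearing loop: only four row transforms were run, so elements 4..7 of each 8-element scratch row are zeroed before the column pass. Sketch:

#include <string.h>
#include <stdint.h>

static void zero_unused_columns_8(int16_t out[8 * 8]) {
  int i;
  for (i = 0; i < 8; ++i)
    memset(&out[i * 8 + 4], 0, 4 * sizeof(out[0])); /* elements 4..7 */
}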
+
+void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r"(pos));
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__(
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r"(out), [a1] "=r"(a1)
+ :);
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__(
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ a11 = a1 >> 2;
+ a12 = a1 - (a11 * 3);
+
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r"(vector_a1)
+ : [a1] "r"(a1));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ }
+}
+
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
+ output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
+#endif // HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
new file mode 100644
index 0000000000..b1731f2345
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -0,0 +1,1489 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
+#include "vpx_ports/mem.h"
+
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+ uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
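The ST_UB offsets above form an informal contract with hz_lpf_t16_16w(), which reloads the same slots. A hypothetical struct view of the 128-byte filter48 scratch (purely illustrative, not a real libvpx type):

#include <stdint.h>

typedef struct {
  uint8_t p2[16], p1[16], p0[16]; /* 8-tap filter results: offsets 0/16/32 */
  uint8_t q0[16], q1[16], q2[16]; /* offsets 48/64/80 */
  uint8_t flat[16];               /* offset 96: per-pixel flat mask */
} filter48_layout;                /* 112 of the 128 scratch bytes used */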
+
+static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 96);
+
+ LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ src -= 3 * pitch;
+ ST_UB4(p2, p1, p0, q0, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1, q2, src, pitch);
+ } else {
+ src -= 7 * pitch;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += pitch;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += pitch;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
+
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += pitch;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += pitch;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += pitch;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += pitch;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += pitch;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+ }
+}
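Each output row above applies the same 16-weight window, which the MSA code maintains incrementally in tmp1_r/tmp1_l with one add/sub pair per row. A direct (non-incremental) scalar sketch of that window, matching the scalar filter16 in vpx_dsp/loopfilter.c, with the outermost pixels replicated:

#include <stdint.h>

/* px[0..15] holds p7..p0,q0..q7 for one column; out[0..13] gets p6'..q6'. */
static void wide_filter_col_sketch(const uint8_t px[16], uint8_t out[14]) {
  int k;
  for (k = 0; k < 14; ++k) {
    const int c = k + 1; /* center tap, weighted twice */
    int sum = px[c], i;
    for (i = c - 7; i <= c + 7; ++i) {
      const int j = i < 0 ? 0 : (i > 15 ? 15 : i); /* replicate p7 / q7 */
      sum += px[j];
    }
    out[k] = (uint8_t)((sum + 8) >> 4); /* __msa_srari_h(x, 4) */
  }
}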
+
+static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr,
+ int32_t count) {
+ DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
+ uint8_t early_exit = 0;
+
+ (void)count;
+
+ early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+ limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ hz_lpf_t16_16w(src, pitch, filter48);
+ }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr, int32_t count) {
+ if (1 == count) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ uint64_t dword0, dword1;
+ v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 p0_filter16, p1_filter16;
+ v8i16 p2_filter8, p1_filter8, p0_filter8;
+ v8i16 q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+ v8u16 tmp0, tmp1, tmp2;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+ q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+ } else {
+ /* convert 8 bit input data into 16 bit */
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+ zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+ q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ /* load 16 vector elements */
+ LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+ LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+ SD(q1_d, src + pitch);
+ SD(q2_d, src + 2 * pitch);
+ } else {
+ /* operate on the right (LSB) 8 pixels of each vector */
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
+ zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
+ q7_r);
+
+ tmp0 = p7_r << 3;
+ tmp0 -= p7_r;
+ tmp0 += p6_r;
+ tmp0 += q0_r;
+
+ src -= 7 * pitch;
+
+ /* calculation of p6 and p5 */
+ tmp1 = p6_r + p5_r + p4_r + p3_r;
+ tmp1 += (p2_r + p1_r + p0_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp0 = p5_r - p6_r + q1_r - p7_r;
+ tmp1 += tmp0;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p4 and p3 */
+ tmp0 = p4_r - p5_r + q2_r - p7_r;
+ tmp2 = p3_r - p4_r + q3_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p2 and p1 */
+ tmp0 = p2_r - p3_r + q4_r - p7_r;
+ tmp2 = p1_r - p2_r + q5_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p0 and q0 */
+ tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+ tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q1 and q2 */
+ tmp0 = q7_r - q0_r + q1_r - p6_r;
+ tmp2 = q7_r - q1_r + q2_r - p5_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q3 and q4 */
+ tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+ tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q5 and q6 */
+ tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+ tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ }
+ }
+ } else {
+ mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
+ count);
+ }
+}
+
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
+}
+
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
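+/* Transpose helpers for the vertical loop filters: pixels around a vertical
+ * edge are transposed into a contiguous scratch buffer (stride 16), filtered
+ * with the same logic used for horizontal edges, and transposed back.
+ */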
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+ v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+ p1_org, p0_org);
+ /* 8x8 transpose */
+ TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+ p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+ /* 8x8 transpose */
+ ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+ ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+ ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+ ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+ SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+ q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+ ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+ int32_t out_pitch) {
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+ v4i32 tmp2, tmp3;
+
+ LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ input += (8 * in_pitch);
+ LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
+
+ /* transpose 16x8 matrix into 8x16 */
+ /* total of 8 intermediate registers and 32 instructions */
+ q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+ q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+ q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+ q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+ q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+ q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+ q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+ q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+ ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+ tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+ ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+ tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+ tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+ ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+ q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
+ tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
+ q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+ q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
+ tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
+ q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch_org,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ /* convert 16 bit output data into 8 bit */
+ p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
+ p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
+ p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
+ q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
+ q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
+ q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
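+/* Wide-filter (flat2) pass over the transposed 8-row block. When flat2 is
+ * all zero, the filter8 rows cached in filter48 are written straight back to
+ * the original image and 1 is returned so the caller skips the final
+ * transpose; otherwise each output row is recomputed and stored to the
+ * scratch buffer at stride 16.
+ */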
+static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16i8 zero = { 0 };
+ v16u8 filter8, flat, flat2;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 tmp0_r, tmp1_r;
+ v8i16 r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST8x1_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST8x1_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST8x1_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST8x1_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST8x1_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST8x1_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST8x1_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST8x1_UB(q6, src);
+
+ return 0;
+ }
+}
+
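+/* Vertical 16 filter: transpose the 16x8 block spanning the edge into a
+ * scratch buffer, filter it as a horizontal edge at stride 16, and transpose
+ * back only when the wide filter actually modified the scratch buffer.
+ */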
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+ early_exit =
+ vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
+ b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit =
+ vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+ }
+ }
+}
+
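+/* 16-row variant of the 4/8-tap pass used by the dual vertical filter: both
+ * the low (ILVR) and high (ILVL) halves of each vector are filtered, and the
+ * six intermediate rows plus the flat mask are cached in filter48.
+ */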
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src_org -= 2;
+ ST4x8_UB(vec2, vec3, src_org, pitch);
+ src_org += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src_org, pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
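+/* 16-row wide-filter pass: same structure as vt_lpf_t16_8w() but computes
+ * both halves of every row and stores full 16-byte vectors to the scratch
+ * buffer.
+ */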
+static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+
+ return 0;
+ }
+}
+
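+/* Dual vertical 16 filter: a full 16x16 transpose lets both 8-row halves be
+ * filtered in a single pass before the result is transposed back.
+ */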
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+ early_exit =
+ vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit =
+ vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_16x16(transposed_input, 16, (src - 8), pitch);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
new file mode 100644
index 0000000000..0eff2b6ca9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
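+/* The dual variant splats each pair of thresh/b_limit/limit bytes and packs
+ * them into a single vector (first set in the low doubleword, second set in
+ * the high doubleword) so both 8-pixel segments are filtered at once.
+ */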
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+ ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
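+/* Vertical 4 filter: load 8 rows of 8 pixels around the edge, transpose so
+ * the p/q columns become rows, apply the 4-tap filter, then re-interleave
+ * the two modified columns on each side of the edge and store them with
+ * 4x4 writes.
+ */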
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 mask, hev, flat, limit, thresh, b_limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v8i16 vec0, vec1, vec2, vec3;
+
+ LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat;
+ v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
+ row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+ src -= 2;
+
+ ST4x8_UB(tmp2, tmp3, src, pitch);
+ src += (8 * pitch);
+ ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
new file mode 100644
index 0000000000..703fcce8a7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ src -= 3 * pitch;
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+ src += (4 * pitch);
+ SD(q1_d, src);
+ src += pitch;
+ SD(q2_d, src);
+ }
+}
+
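+/* Dual 8 filter: thresholds for the two 8-pixel halves are packed into one
+ * vector as in the 4-tap dual case; wherever the flat mask is set, the
+ * filter8 result replaces the filter4 output (or the original pixel) before
+ * the rows are stored.
+ */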
+void vpx_lpf_horizontal_8_dual_msa(
+ uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ tmp = (v16u8)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ tmp = (v16u8)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ tmp = (v16u8)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ src -= 3 * pitch;
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1_out, q2_out, src, pitch);
+ src += (2 * pitch);
+ }
+}
+
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ /* load vector elements */
+ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ /* Store 4 pixels p1 - q1 */
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+ p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ /* Store 6 pixels p2 - q2 */
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src -= 3;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 4, src + 4, pitch);
+ }
+}
+
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8_t *temp_src;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ temp_src = src - 4;
+
+ LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+ temp_src += (8 * pitch);
+ LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+ /* transpose 16x8 matrix into 8x16 */
+ TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ vec0 = (v8i16)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ vec0 = (v8i16)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ vec0 = (v8i16)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src -= 2;
+ ST4x8_UB(vec2, vec3, src, pitch);
+ src += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+
+ /* filter8 */
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 4, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 4, src + 4, pitch);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
new file mode 100644
index 0000000000..f1743679a7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
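+/* The DSPr2 versions work on 32-bit words holding four pixels each:
+ * replv.qb splats the per-byte thresholds across a word, and the vertical
+ * filter transposes 4x4 byte blocks in registers with precrq/precr/append
+ * before filtering, then writes the results back with byte stores.
+ */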
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ /* loop filter designed to work on chars so that we can make maximum use
+ of 8-bit SIMD instructions. */
+ for (i = 0; i < 2; i++) {
+ sm1 = s - (pitch << 2);
+ s0 = sm1 + pitch;
+ s1 = s0 + pitch;
+ s2 = s - pitch;
+ s3 = s;
+ s4 = s + pitch;
+ s5 = s4 + pitch;
+ s6 = s5 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p1], (%[s1]) \n\t"
+ "lw %[p2], (%[s2]) \n\t"
+ "lw %[p3], (%[s3]) \n\t"
+ "lw %[p4], (%[s4]) \n\t"
+
+ : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ mask will be zero and filtering is not needed */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ __asm__ __volatile__(
+ "lw %[pm1], (%[sm1]) \n\t"
+ "lw %[p0], (%[s0]) \n\t"
+ "lw %[p5], (%[s5]) \n\t"
+ "lw %[p6], (%[s6]) \n\t"
+
+ : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+ : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
+
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+ /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ __asm__ __volatile__(
+ "sw %[p1], (%[s1]) \n\t"
+ "sw %[p2], (%[s2]) \n\t"
+ "sw %[p3], (%[s3]) \n\t"
+ "sw %[p4], (%[s4]) \n\t"
+
+ :
+ : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+ [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+ /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* store the processed 4x4 neighborhood
+ * byte by byte; a transposed word store
+ * can't be used because the output memory
+ * isn't word aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s1] "r"(s1));
+ }
+ }
+ }
+}
+
+void vpx_lpf_horizontal_4_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_horizontal_8_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+ vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
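+
+/* The dual variants above just run the corresponding base filter twice on two
+ * adjacent 8-pixel edges; note that vpx_lpf_vertical_16_dual_dspr2 reuses a
+ * single blimit/limit/thresh set for both halves. */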
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
new file mode 100644
index 0000000000..ec339be868
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+ int32_t vpx_filter_l, vpx_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes to preserve accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vpx_filter &= hev; */
+ "and %[vpx_filter_l], %[vpx_filter_l], %[hev_l] \n\t"
+ "and %[vpx_filter_r], %[vpx_filter_r], %[hev_r] \n\t"
+
+ /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+
+ /* vpx_filter &= mask; */
+ "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
+ "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
+
+ : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* round one side with +4 and the other with +3 before the >> 3 */
+ __asm__ __volatile__(
+ /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vpx_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vpx_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
+
+ __asm__ __volatile__(
+ /* vpx_filter = (vpx_filter + 1) >> 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vpx_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
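+
+/* For reference, each byte lane of the quad-byte vectors above performs the
+ * standard filter4 step from the C reference. A scalar sketch (illustrative
+ * only, with mask/hev taken as 0 or -1 per pixel and sc() standing in for
+ * the signed-char clamp):
+ *
+ *   int f = sc(ps1 - qs1) & hev;
+ *   f = sc(f + 3 * (qs0 - ps0)) & mask;
+ *   int Filter1 = sc(f + 4) >> 3;
+ *   int Filter2 = sc(f + 3) >> 3;
+ *   qs0 = sc(qs0 - Filter1);
+ *   ps0 = sc(ps0 + Filter2);
+ *   f = ((Filter1 + 1) >> 1) & ~hev;
+ *   qs1 = sc(qs1 - f);
+ *   ps1 = sc(ps1 + f);
+ */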
+
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
+ uint32_t ps0, uint32_t qs0, uint32_t qs1,
+ uint32_t *p1_f0, uint32_t *p0_f0,
+ uint32_t *q0_f0, uint32_t *q1_f0) {
+ int32_t vpx_filter_l, vpx_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (ps0) ^ N128;
+ vps1 = (ps1) ^ N128;
+ vqs0 = (qs0) ^ N128;
+ vqs1 = (qs1) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes to preserve accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vpx_filter &= hev; */
+ "and %[vpx_filter_l], %[vpx_filter_l], %[hev_l] \n\t"
+ "and %[vpx_filter_r], %[vpx_filter_r], %[hev_r] \n\t"
+
+ /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t"
+
+ /* vpx_filter &= mask; */
+ "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
+ "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
+
+ : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* round one side with +4 and the other with +3 before the >> 3 */
+ __asm__ __volatile__(
+ /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vpx_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vpx_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
+
+ __asm__ __volatile__(
+ /* vpx_filter = (vpx_filter + 1) >> 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vpx_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *p0_f0 = vps0 ^ N128;
+ *p1_f0 = vps1 ^ N128;
+ *q0_f0 = vqs0 ^ N128;
+ *q1_f0 = vqs1 ^ N128;
+}
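+
+/* filter1_dspr2() computes the same filter as filter_dspr2() but takes its
+ * inputs by value and writes to separate *_f0 outputs, so callers can keep
+ * the unfiltered pixels and later select between these results and the
+ * flat-filtered ones. */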
+
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
+ uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3) {
+ /* use a 7-tap filter [1, 1, 1, 2, 1, 1, 1] for flat lines */
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+}
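+
+/* The asm above factors out the shared partial sum
+ *   add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4
+ * and derives each output from it with a few corrections, e.g. for *op2:
+ *   res_op2 = (3 * p3 + p2 + add_p210_q012 - q1 - q2) >> 3
+ *           = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)
+ */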
+
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3, uint32_t *op2_f1,
+ uint32_t *op1_f1, uint32_t *op0_f1,
+ uint32_t *oq0_f1, uint32_t *oq1_f1,
+ uint32_t *oq2_f1) {
+ /* use a 7-tap filter [1, 1, 1, 2, 1, 1, 1] for flat lines */
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2_f1 = res_op2;
+ *op1_f1 = res_op1;
+ *op0_f1 = res_op0;
+ *oq0_f1 = res_oq0;
+ *oq1_f1 = res_oq1;
+ *oq2_f1 = res_oq2;
+}
+
+static INLINE void wide_mbfilter_dspr2(
+ uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
+ uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
+ uint32_t *oq7) {
+ const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
+
+ __asm__ __volatile__(
+ /* sum of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6,
+ which most of the outputs below share */
+ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
+
+ : [add_p6toq6] "=&r"(add_p6toq6)
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
+ [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [u32Eight] "r"(u32Eight));
+
+ __asm__ __volatile__(
+ /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+ p3 + p2 + p1 + p0 + q0, 4) */
+ "shll.ph %[tmp], %[p7], 3 \n\t"
+ "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
+ "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
+
+ /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+ p2 + p1 + p0 + q0 + q1, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
+ "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
+
+ /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+ p1 + p0 + q0 + q1 + q2, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
+ "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
+
+ /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1 + q2 + q3, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
+ "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
+
+ /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+ p0 + q0 + q1 + q2 + q3 + q4, 4) */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
+
+ /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+ p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
+
+ /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+ "addu.ph %[res_op0], %[p7], %[p0] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
+
+ : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
+ [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *op6 = res_op6;
+ *op5 = res_op5;
+ *op4 = res_op4;
+ *op3 = res_op3;
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+
+ __asm__ __volatile__(
+ /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+ "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
+
+ /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+ q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
+
+ /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+ q3 + q4 + q5 + q6 + q7 * 3, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
+
+ /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+ q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
+ "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
+
+ /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+ q4 * 2 + q5 + q6 + q7 * 5, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
+ "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
+
+ /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+ q5 * 2 + q6 + q7 * 6, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
+ "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
+
+ /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+ q4 + q5 + q6 * 2 + q7 * 7, 4) */
+ "shll.ph %[tmp], %[q7], 3 \n\t"
+ "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
+ "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
+
+ : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
+ [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
+ [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
+ [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
+ : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
+ [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+ *oq3 = res_oq3;
+ *oq4 = res_oq4;
+ *oq5 = res_oq5;
+ *oq6 = res_oq6;
+}
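+
+/* Same shared-sum trick as mbfilter_dspr2(), but over the 15-tap window:
+ *   add_p6toq6 = p6 + p5 + ... + p0 + q0 + ... + q6 + 8
+ * e.g. for *op6:
+ *   res_op6 = (7 * p7 + p6 + add_p6toq6 - q1 - q2 - q3 - q4 - q5 - q6) >> 4
+ *           = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 +
+ *                                p0 + q0, 4)
+ */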
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
new file mode 100644
index 0000000000..9af0b42360
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define STORE_F0() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r"(p1_f0) \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
+ [p0_f0] "r"(p0_f0)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
+ }
+
+#define STORE_F1() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
+ [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
+ [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
+ }
+
+#define STORE_F2() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
+ [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
+ [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
+ [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
+ [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
+ [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
+ [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
+ [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
+ [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
+ }
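+
+/* In STORE_F1/STORE_F2 the *_r registers hold byte lanes 0-1 as halfwords and
+ * the *_l registers lanes 2-3, so each macro stores the low halfword to one
+ * row, shifts right by 16, and stores the high halfword to the next row.
+ * STORE_F0 works on packed quad-bytes and steps one byte (srl 8) per row. */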
+
+#define PACK_LEFT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
+ [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
+ [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_LEFT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
+ [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
+ [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
+
+#define PACK_RIGHT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
+ [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
+ [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_RIGHT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
+ [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
+ [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
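+
+/* preceu.ph.qbl/preceu.ph.qbr zero-extend the two most/least significant
+ * bytes of a quad-byte register into two unsigned halfwords, giving the
+ * flat filters 16 bits of headroom per pixel. A C sketch of the expansion:
+ *
+ *   p3_l = (((p3 >> 24) & 0xff) << 16) | ((p3 >> 16) & 0xff);
+ *   p3_r = (((p3 >> 8) & 0xff) << 16) | (p3 & 0xff);
+ */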
+
+#define COMBINE_LEFT_RIGHT_0TO2() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
+ [q1] "=&r"(q1), [q2] "=&r"(q2) \
+ : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
+ [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
+ [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
+ [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
+ }
+
+#define COMBINE_LEFT_RIGHT_3TO6() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
+ [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
+ [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
+ [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
+ [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
+ [q6_r] "r"(q6_r)); \
+ }
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
new file mode 100644
index 0000000000..24c492bea0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* process 4 pixels at a time;
+ * compute hev and mask in the same function */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+ uint32_t p1, uint32_t p0, uint32_t p3,
+ uint32_t p2, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev,
+ uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit) */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+}
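+
+/* Per the MIPS DSP ASE, cmpgu.lt.qb writes four per-byte comparison flags to
+ * bits 3:0 of a GPR; the flags are OR-accumulated above, "sll 24" aligns them
+ * with the DSPControl ccond field, "wrdsp" loads them, and pick.qb expands
+ * each flag into a 0x00/0xFF byte:
+ *   hev1 = pick.qb(ones, 0)  ->  0xFF where thresh was exceeded
+ *   s2   = pick.qb(0, ones)  ->  0xFF where no limit was exceeded (the mask)
+ */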
+
+static INLINE void filter_hev_mask_flatmask4_dspr2(
+ uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ * flat |= (abs(p1 - p0) > flat_thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ * flat |= (abs(q1 - q0) > flat_thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > flat_thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > flat_thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > flat_thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > flat_thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ /* note: the wrdsp below may introduce a pipeline stall */
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
+ [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit) */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+ *flat = flat1;
+}
+
+static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4, uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__(
+ /* flat |= (abs(p4 - p0) > flat_thresh) */
+ "subu_s.qb %[c], %[p4], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* flat |= (abs(q4 - q0) > flat_thresh) */
+ "subu_s.qb %[c], %[q4], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+ "wrdsp %[r] \n\t"
+ "pick.qb %[flat3], $0, %[ones] \n\t"
+
+ /* flat |= (abs(p1 - p0) > flat_thresh) */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* flat |= (abs(q1 - q0) > flat_thresh) */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > flat_thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+ "and %[flat1], %[flat3], %[flat1] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
+ [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
+ [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ *flat2 = flat1;
+}
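+
+/* A hypothetical scalar equivalent of flatmask5 per byte lane (added
+   sketch, following flat_mask5() in vpx_dsp/loopfilter.c; the flat
+   threshold is 1 per byte here):
+
+     flat2_lane = (|p4-p0| <= 1 && |q4-q0| <= 1) &&     // flat3 above
+                  (|p1-p0| <= 1 && |q1-q0| <= 1 &&
+                   |p2-p0| <= 1 && |q2-q0| <= 1 &&
+                   |p3-p0| <= 1 && |q3-q0| <= 1);       // flat1 above
+
+   Lanes that qualify come out as 0xFF in *flat2, the rest as 0x00. */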
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
new file mode 100644
index 0000000000..e42479257c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
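+
+  /* replv.qb broadcasts the low byte of its source into all four byte
+     lanes; an illustrative scalar equivalent would be
+     thresh_vec = uthresh * 0x01010101u (and likewise for the others). */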
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ for (i = 0; i < 2; i++) {
+ sp3 = s - (pitch << 2);
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
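+    /* Dispatch overview (added note): when no lane is flat, only the
+       4-tap filter runs and whole words can be stored; when mask & flat
+       is set in every lane, the 7-tap mbfilter output is stored
+       word-wide; mixed lanes fall through to byte-by-byte stores. */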
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat != 0) && (mask != 0)) {
+ /* filtering */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
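+      /* Lane walk (added note): the *_r words carry columns 0..1
+         widened to halfwords, so after column 0 is stored from the low
+         byte, srl by 16 exposes column 1; the packed f0 words advance
+         one column per srl by 8. Columns 2..3 are handled the same way
+         from the *_l words. */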
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
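+    /* Added note on the instruction pattern below: precrq.qb.ph gathers
+       the high byte of each halfword of its two operands and precr.qb.ph
+       the low bytes, so two interleaving rounds, with precrq.ph.w and
+       append recombining the halfword halves, complete the 4x4 byte
+       transpose. */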
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
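+      /* STORE_F0 (and STORE_F1/STORE_F2 used below) scatter the filtered
+         columns back to memory; they presumably come from
+         loopfilter_macros_dspr2.h, included above. */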
+ STORE_F0()
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat != 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
new file mode 100644
index 0000000000..9c1f5143f2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
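+  /* Each iteration below filters a 4-pixel-wide group, so count == 1
+     covers the 8 pixels of vpx_lpf_horizontal_16 and count == 2 the
+     16 pixels of the _dual variant (see the wrappers at file end). */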
+ for (i = 0; i < (2 * count); i++) {
+ sp7 = s - (pitch << 3);
+ sp6 = sp7 + pitch;
+ sp5 = sp6 + pitch;
+ sp4 = sp5 + pitch;
+ sp3 = sp4 + pitch;
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+ sq4 = sq3 + pitch;
+ sq5 = sq4 + pitch;
+ sq6 = sq5 + pitch;
+ sq7 = sq6 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p7], (%[sp7]) \n\t"
+ "lw %[p6], (%[sp6]) \n\t"
+ "lw %[p5], (%[sp5]) \n\t"
+ "lw %[p4], (%[sp4]) \n\t"
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
+
+ __asm__ __volatile__(
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+ "lw %[q4], (%[sq4]) \n\t"
+ "lw %[q5], (%[sq5]) \n\t"
+ "lw %[q6], (%[sq6]) \n\t"
+ "lw %[q7], (%[sq7]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
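+    /* flat2, flat and mask now pick the filter strength per lane: f0 is
+       the 4-tap filter, f1 the 7-tap mbfilter, f2 the 15-tap wide
+       filter. (Both arms of the f0 test below reduce to
+       (flat == 0) && (mask != 0).) */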
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+ COMBINE_LEFT_RIGHT_3TO6()
+
+ __asm__ __volatile__(
+ "sw %[p6], (%[sp6]) \n\t"
+ "sw %[p5], (%[sp5]) \n\t"
+ "sw %[p4], (%[sp4]) \n\t"
+ "sw %[p3], (%[sp3]) \n\t"
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+
+ :
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
+ [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sw %[q6], (%[sq6]) \n\t"
+ "sw %[q5], (%[sq5]) \n\t"
+ "sw %[q4], (%[sq4]) \n\t"
+ "sw %[q3], (%[sq3]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+
+ :
+ : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
+ [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
+ [sq1] "r"(sq1), [sq0] "r"(sq0));
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 + f2 */
+ /* f0 function */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* f1 function */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ /* f2 function */
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], (%[sp6]) \n\t"
+ "sb %[p5_r], (%[sp5]) \n\t"
+ "sb %[p4_r], (%[sp4]) \n\t"
+ "sb %[p3_r], (%[sp3]) \n\t"
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+ "sb %[q3_r], (%[sq3]) \n\t"
+ "sb %[q4_r], (%[sq4]) \n\t"
+ "sb %[q5_r], (%[sq5]) \n\t"
+ "sb %[q6_r], (%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], (%[sp2]) \n\t"
+ "sb %[p1_r_f1], (%[sp1]) \n\t"
+ "sb %[p0_r_f1], (%[sp0]) \n\t"
+ "sb %[q0_r_f1], (%[sq0]) \n\t"
+ "sb %[q1_r_f1], (%[sq1]) \n\t"
+ "sb %[q2_r_f1], (%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
+ [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
+ [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], +1(%[sp6]) \n\t"
+ "sb %[p5_r], +1(%[sp5]) \n\t"
+ "sb %[p4_r], +1(%[sp4]) \n\t"
+ "sb %[p3_r], +1(%[sp3]) \n\t"
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+ "sb %[q3_r], +1(%[sq3]) \n\t"
+ "sb %[q4_r], +1(%[sq4]) \n\t"
+ "sb %[q5_r], +1(%[sq5]) \n\t"
+ "sb %[q6_r], +1(%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], +1(%[sp2]) \n\t"
+ "sb %[p1_r_f1], +1(%[sp1]) \n\t"
+ "sb %[p0_r_f1], +1(%[sp0]) \n\t"
+ "sb %[q0_r_f1], +1(%[sq0]) \n\t"
+ "sb %[q1_r_f1], +1(%[sq1]) \n\t"
+ "sb %[q2_r_f1], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +2(%[sp6]) \n\t"
+ "sb %[p5_l], +2(%[sp5]) \n\t"
+ "sb %[p4_l], +2(%[sp4]) \n\t"
+ "sb %[p3_l], +2(%[sp3]) \n\t"
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+ "sb %[q3_l], +2(%[sq3]) \n\t"
+ "sb %[q4_l], +2(%[sq4]) \n\t"
+ "sb %[q5_l], +2(%[sq5]) \n\t"
+ "sb %[q6_l], +2(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +2(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +2(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +2(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +2(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +2(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +3(%[sp6]) \n\t"
+ "sb %[p5_l], +3(%[sp5]) \n\t"
+ "sb %[p4_l], +3(%[sp4]) \n\t"
+ "sb %[p3_l], +3(%[sp3]) \n\t"
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+ "sb %[q3_l], +3(%[sq3]) \n\t"
+ "sb %[q4_l], +3(%[sq4]) \n\t"
+ "sb %[q5_l], +3(%[sq5]) \n\t"
+ "sb %[q6_l], +3(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +3(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +3(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +3(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +3(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +3(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
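+/* Thin wrappers: they differ only in count, i.e. the number of 4-pixel
+   groups the main loop above processes. */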
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
+}
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
new file mode 100644
index 0000000000..96e8d8858a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[p4], -8(%[s1]) \n\t"
+ "lw %[p5], -8(%[s2]) \n\t"
+ "lw %[p6], -8(%[s3]) \n\t"
+ "lw %[p7], -8(%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+ "lw %[q7], +4(%[s1]) \n\t"
+ "lw %[q6], +4(%[s2]) \n\t"
+ "lw %[q5], +4(%[s3]) \n\t"
+ "lw %[q4], +4(%[s4]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p7, p6, p5, p4
+ original (when loaded from memory)
+ register -8 -7 -6 -5
+ p4 p4_0 p4_1 p4_2 p4_3
+ p5 p5_0 p5_1 p5_2 p5_3
+ p6 p6_0 p6_1 p6_2 p6_3
+ p7 p7_0 p7_1 p7_2 p7_3
+
+ after transpose
+ register
+ p4 p7_3 p6_3 p5_3 p4_3
+ p5 p7_2 p6_2 p5_2 p4_2
+ p6 p7_1 p6_1 p5_1 p4_1
+ p7 p7_0 p6_0 p5_0 p4_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
+ "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p7], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
+ [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q4, q5, q6, q7
+ original (when loaded from memory)
+ register +5 +6 +7 +8
+ q7 q7_0 q7_1 q7_2 q7_3
+ q6 q6_0 q6_1 q6_2 q6_3
+ q5 q5_0 q5_1 q5_2 q5_3
+ q4 q4_0 q4_1 q4_2 q4_3
+
+ after transpose
+ register
+       q7            q4_3  q5_3  q6_3  q7_3
+       q6            q4_2  q5_2  q6_2  q7_2
+       q5            q4_1  q5_1  q6_1  q7_1
+       q4            q4_0  q5_0  q6_0  q7_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
+ "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
+ "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
+ "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
+
+ "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
+ "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
+ "append %[q6], %[sec3], 16 \n\t"
+ "append %[q4], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
+ [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ STORE_F2()
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1+f2 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s4]) \n\t"
+ "sb %[p5_r], -6(%[s4]) \n\t"
+ "sb %[p4_r], -5(%[s4]) \n\t"
+ "sb %[p3_r], -4(%[s4]) \n\t"
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+ "sb %[q3_r], +3(%[s4]) \n\t"
+ "sb %[q4_r], +4(%[s4]) \n\t"
+ "sb %[q5_r], +5(%[s4]) \n\t"
+ "sb %[q6_r], +6(%[s4]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s4] "r"(s4));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s4]) \n\t"
+ "sb %[p1_r_f1], -2(%[s4]) \n\t"
+ "sb %[p0_r_f1], -1(%[s4]) \n\t"
+ "sb %[q0_r_f1], (%[s4]) \n\t"
+ "sb %[q1_r_f1], +1(%[s4]) \n\t"
+ "sb %[q2_r_f1], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
+ [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
+ [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s3]) \n\t"
+ "sb %[p5_r], -6(%[s3]) \n\t"
+ "sb %[p4_r], -5(%[s3]) \n\t"
+ "sb %[p3_r], -4(%[s3]) \n\t"
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+ "sb %[q3_r], +3(%[s3]) \n\t"
+ "sb %[q4_r], +4(%[s3]) \n\t"
+ "sb %[q5_r], +5(%[s3]) \n\t"
+ "sb %[q6_r], +6(%[s3]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s3] "r"(s3));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s3]) \n\t"
+ "sb %[p1_r_f1], -2(%[s3]) \n\t"
+ "sb %[p0_r_f1], -1(%[s3]) \n\t"
+ "sb %[q0_r_f1], (%[s3]) \n\t"
+ "sb %[q1_r_f1], +1(%[s3]) \n\t"
+ "sb %[q2_r_f1], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s2]) \n\t"
+ "sb %[p5_l], -6(%[s2]) \n\t"
+ "sb %[p4_l], -5(%[s2]) \n\t"
+ "sb %[p3_l], -4(%[s2]) \n\t"
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+ "sb %[q3_l], +3(%[s2]) \n\t"
+ "sb %[q4_l], +4(%[s2]) \n\t"
+ "sb %[q5_l], +5(%[s2]) \n\t"
+ "sb %[q6_l], +6(%[s2]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s2] "r"(s2));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s2]) \n\t"
+ "sb %[p1_l_f1], -2(%[s2]) \n\t"
+ "sb %[p0_l_f1], -1(%[s2]) \n\t"
+ "sb %[q0_l_f1], (%[s2]) \n\t"
+ "sb %[q1_l_f1], +1(%[s2]) \n\t"
+ "sb %[q2_l_f1], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s1]) \n\t"
+ "sb %[p5_l], -6(%[s1]) \n\t"
+ "sb %[p4_l], -5(%[s1]) \n\t"
+ "sb %[p3_l], -4(%[s1]) \n\t"
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s1] "r"(s1));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s1]) \n\t"
+          "sb     %[q1_l],  +1(%[s1])    \n\t"
+          "sb     %[q2_l],  +2(%[s1])    \n\t"
+          "sb     %[q3_l],  +3(%[s1])    \n\t"
+          "sb     %[q4_l],  +4(%[s1])    \n\t"
+          "sb     %[q5_l],  +5(%[s1])    \n\t"
+          "sb     %[q6_l],  +6(%[s1])    \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s1] "r"(s1));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s1]) \n\t"
+ "sb %[p1_l_f1], -2(%[s1]) \n\t"
+ "sb %[p0_l_f1], -1(%[s1]) \n\t"
+ "sb %[q0_l_f1], (%[s1]) \n\t"
+ "sb %[q1_l_f1], +1(%[s1]) \n\t"
+ "sb %[q2_l_f1], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h
new file mode 100644
index 0000000000..1ea05e0b0b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
+ p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt &= hev; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
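+
+/* Usage sketch (illustrative, not part of the upstream sources; the
+   vector names are hypothetical). Given pixel rows and the mask/hev
+   vectors produced by LPF_MASK_HEV:
+
+     v16u8 p1_out, p0_out, q0_out, q1_out;
+     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out,
+                        q0_out, q1_out);
+
+   filters 16 columns at once. Note that 'hev' is inverted in place by
+   the macro. */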
+
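+/* Note: VP9_FLAT4 below also reads a variable named 'mask' from the
+   caller's scope; it is not an explicit macro parameter. */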
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ { \
+ v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
+ v16u8 zero_in = { 0 }; \
+ \
+ tmp_flat4 = __msa_ori_b(zero_in, 1); \
+ p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
+ q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
+ p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
+ q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
+ \
+ p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
+ flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
+ p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
+ flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
+ \
+ flat_out = (tmp_flat4 < (v16u8)flat_out); \
+ flat_out = __msa_xori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
+ }
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ { \
+ v16u8 tmp_flat5, zero_in = { 0 }; \
+ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
+ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
+ \
+ tmp_flat5 = __msa_ori_b(zero_in, 1); \
+ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
+ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
+ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
+ q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
+ p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
+ q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
+ p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
+ q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
+ \
+ p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
+ p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
+ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
+ \
+ flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
+ flat2_out = __msa_xori_b(flat2_out, 0xff); \
+ flat2_out = flat2_out & flat_in; \
+ }
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ { \
+ v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
+ \
+ tmp_filt8_2 = p2_in + p1_in + p0_in; \
+ tmp_filt8_0 = p3_in << 1; \
+ \
+ tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
+ tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
+ p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
+ p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = q2_in + q1_in + q0_in; \
+ tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
+ tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
+ tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
+ p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
+ \
+ tmp_filt8_0 = q2_in + q3_in; \
+ tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
+ tmp_filt8_1 = q3_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
+ q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_0 = tmp_filt8_2 + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + q0_in; \
+ q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 - p2_in; \
+ tmp_filt8_0 = q1_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
+ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ }
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = thresh_in < (v16u8)flat_out; \
+ \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m >>= 1; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ \
+ mask_out = b_limit_in < p0_asub_q0_m; \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = limit_in < (v16u8)mask_out; \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+ }
+#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h
new file mode 100644
index 0000000000..53462b59f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h
@@ -0,0 +1,1971 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_
+#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
+#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
+#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
+
+#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
+#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
+#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
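+/* MIPS release 6 removed the lwl/lwr and ldl/ldr instructions, so plain
+   (unaligned-capable) loads and stores are used on r6 and later. */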
+#define LH(psrc) \
+ ({ \
+ uint16_t val_lh_m = *(const uint16_t *)(psrc); \
+ val_lh_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ uint32_t val_lw_m = *(const uint32_t *)(psrc); \
+ val_lw_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint64_t val_ld_m = *(const uint64_t *)(psrc); \
+ val_ld_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint32_t val0_ld_m, val1_ld_m; \
+ uint64_t val_ld_m = 0; \
+ \
+ val0_ld_m = LW(psrc_ld_m); \
+ val1_ld_m = LW(psrc_ld_m + 4); \
+ \
+ val_ld_m = (uint64_t)(val1_ld_m); \
+ val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+ val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \
+ \
+ val_ld_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) *(uint16_t *)(pdst) = (val);
+#define SW(val, pdst) *(uint32_t *)(pdst) = (val);
+#define SD(val, pdst) *(uint64_t *)(pdst) = (val);
+#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \
+ uint16_t val_lh_m; \
+ \
+ __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
+ \
+ : [val_lh_m] "=r"(val_lh_m) \
+ : [psrc_lh_m] "m"(*psrc_lh_m)); \
+ \
+ val_lh_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ __volatile__( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ __volatile__( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint32_t val0_ld_m, val1_ld_m; \
+ uint64_t val_ld_m = 0; \
+ \
+ val0_ld_m = LW(psrc_ld_m); \
+ val1_ld_m = LW(psrc_ld_m + 4); \
+ \
+ val_ld_m = (uint64_t)(val1_ld_m); \
+ val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+ val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \
+ \
+ val_ld_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_sh_m = (uint8_t *)(pdst); \
+ const uint16_t val_sh_m = (val); \
+ \
+ __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \
+ \
+ : [pdst_sh_m] "=m"(*pdst_sh_m) \
+ : [val_sh_m] "r"(val_sh_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_sw_m = (uint8_t *)(pdst); \
+ const uint32_t val_sw_m = (val); \
+ \
+ __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \
+ \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_sw_m] "r"(val_sw_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_sd_m = (uint8_t *)(pdst); \
+ uint32_t val0_sd_m, val1_sd_m; \
+ \
+ val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_sd_m, pdst_sd_m); \
+ SW(val1_sd_m, pdst_sd_m + 4); \
+ }
+#endif // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1, out2, out3
+ Details : Load word in 'out0' from (psrc)
+ Load word in 'out1' from (psrc + stride)
+ Load word in 'out2' from (psrc + 2 * stride)
+ Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ out0 = LW((psrc)); \
+ out1 = LW((psrc) + stride); \
+ out2 = LW((psrc) + 2 * stride); \
+ out3 = LW((psrc) + 3 * stride); \
+ }
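+
+/* Usage sketch (illustrative, not part of the upstream sources; 'src'
+   and 'stride' are hypothetical):
+     uint32_t w0, w1, w2, w3;
+     LW4(src, stride, w0, w1, w2, w3);
+   reads four possibly unaligned 32-bit words from four consecutive
+   rows. */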
+
+/* Description : Load double words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load double word in 'out0' from (psrc)
+ Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+ }
+#define LD4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+ }
+
+/* Description : Store 4 words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store word from 'in0' to (pdst)
+ Store word from 'in1' to (pdst + stride)
+ Store word from 'in2' to (pdst + 2 * stride)
+ Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) \
+ { \
+    SW(in0, (pdst));                          \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+ }
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) \
+ { \
+    SD(in0, (pdst));                          \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+ }
+
+/* Description : Load vector elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Load 16 byte elements in 'out0' from (psrc)
+ Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_V2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_V(RTYPE, (psrc)); \
+ out1 = LD_V(RTYPE, (psrc) + stride); \
+ }
+#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
+#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
+#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
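+
+/* Usage sketch (illustrative, not part of the upstream sources; 'src'
+   and 'stride' are hypothetical):
+     v16u8 row0, row1;
+     LD_UB2(src, stride, row0, row1);
+   loads two 16-byte vectors from (src) and (src + stride). */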
+
+#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
+ { \
+ LD_V2(RTYPE, (psrc), stride, out0, out1); \
+ out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
+ }
+#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
+
+#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_V2(RTYPE, (psrc), stride, out0, out1); \
+ LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
+#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
+
+#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+ { \
+ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
+ }
+#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
+
+#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+ { \
+ LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+ }
+#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
+
+#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
+#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
+#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
+
+#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+ { \
+ LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7); \
+ LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+ out13, out14, out15); \
+ }
+#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+ data into 4 vectors (Each vector with 4 signed halfwords)
+ Arguments : Input - psrc
+ Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) \
+ { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ }
+
+/* Description : Store vectors with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_V2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_V(RTYPE, in0, (pdst)); \
+ ST_V(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
+#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
+#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
+
+#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_V2(RTYPE, in0, in1, (pdst), stride); \
+ ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
+#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
+
+#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
+#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
+#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ Arguments : Inputs - in, stidx, pdst, stride
+ Details : Index 'stidx' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst)
+ Index 'stidx+1' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + stride)
+ Index 'stidx+2' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 2 * stride)
+ Index 'stidx+3' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) \
+ { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+ }
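+
+/* Usage sketch (illustrative, not part of the upstream sources; 'vec',
+   'dst' and 'stride' are hypothetical):
+     ST2x4_UB(vec, 0, dst, stride);
+   stores halfword elements 0..3 of 'vec' to four consecutive rows, two
+   bytes per row. */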
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to the GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to the GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+ }
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : 'Idx0' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst)
+ 'Idx1' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + stride)
+ 'Idx2' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ 'Idx3' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+ }
+#define ST4x8_UB(in0, in1, pdst, stride) \
+ { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+ }
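+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical):
+     ST4x4_UB(out0, out1, 0, 1, 0, 1, dst, stride);
+   stores words 0 and 1 of 'out0' to rows 0..1 and words 0 and 1 of
+   'out1' to rows 2..3, four bytes per row. */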
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) \
+ { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+ }
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+ }
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+ }
+
+/* Description : Average with rounding (in0 + in1 + 1) / 2
+   Arguments   : Inputs  - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from 'in0' vector is added with
+ each unsigned byte element from 'in1' vector. Then the average
+ with rounding is calculated and written to 'out0'
+*/
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
+ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
+ }
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
+ }
+#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide with zero
+ Arguments : Inputs - in0, in1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'zero_m' vector are slid into 'in0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+ }
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
+ slide_val) \
+ { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+ }
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+ Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ { \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
+ }
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
+ out2, slide_val) \
+ { \
+    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);  \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+ }
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
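+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical):
+     SLDI_B2_UB(a0, a1, b0, b1, out0, out1, 2);
+   slides the byte elements of 'a0' into 'b0' by two positions (and
+   'a1' into 'b1'), per the description above. */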
+
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
+ out3) \
+ { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+ }
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
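+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical). With 'mask0'..'mask3' holding byte indices into
+   the concatenated source pair:
+     VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, out0, out1,
+                out2, out3);
+   gathers four shuffled vectors from the same two sources. */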
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Unsigned byte elements from 'mult0' are multiplied with
+ unsigned byte elements from 'cnst0' producing a result
+ twice the size of input i.e. unsigned halfword.
+                 The multiplication results of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+ }
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
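+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical):
+     DOTP_UB2_UH(pix0, pix1, coeff0, coeff1, sum0, sum1);
+   forms eight halfword sums per output, one per adjacent byte pair of
+   pixels and coefficients. */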
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+                 The multiplication results of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+                 The multiplication results of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed word elements from 'mult0' are multiplied with
+ signed word elements from 'cnst0' producing a result
+ twice the size of input i.e. signed double word.
+                 The multiplication results of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+ }
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+                 The multiplication results of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
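+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical). With 'acc0' and 'acc1' as v8i16 accumulators:
+     DPADD_SB2_SH(src0, src1, filt0, filt1, acc0, acc1);
+   multiplies signed byte pairs and accumulates into the halfword sums,
+   as in a multi-tap filter loop. */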
+
+/* Description : Dot product & addition of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+                 The multiplication results of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+ Arguments : Inputs - mult0, mult1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed word element from 'mult0' is multiplied with itself
+ producing an intermediate result twice the size of input
+ i.e. signed double word
+                 The multiplication results of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+ out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+ }
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Elementwise minimum of unsigned elements from the input
+                 vector and 'min_vec', written back to the input vector
+ Arguments : Inputs - in0, in1, min_vec
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Minimum of unsigned halfword element values from 'in0' and
+ 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) \
+ { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+ }
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
+ { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+ }
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) \
+ ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+ })
+#define CLIP_SH2_0_255(in0, in1) \
+ { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+ }
+#define CLIP_SH4_0_255(in0, in1, in2, in3) \
+ { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+ }
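+
+/* Usage sketch (illustrative, not part of the upstream sources; 'sum'
+   is hypothetical):
+     v8i16 pixels = CLIP_SH_0_255(sum);
+   clamps each signed halfword to the unsigned 8-bit range before the
+   result is packed back to bytes. */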
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ Arguments : Input - in (signed word vector)
+ Output - sum_m (i32 sum)
+ Return Type - signed word (GP)
+ Details : 4 signed word elements of 'in' vector are added together and
+ the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) \
+ ({ \
+ v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \
+ int32_t hadd_sw_s32_sum_m; \
+ \
+ hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \
+ hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \
+ hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \
+ hadd_sw_s32_sum_m; \
+ })
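+
+/* Usage sketch (illustrative, not part of the upstream sources; 'vec'
+   is hypothetical):
+     int32_t total = HADD_SW_S32(vec);
+   reduces the four signed words of 'vec' to a single scalar sum. */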
+
+/* Description : Horizontal addition of 4 unsigned word elements
+ Arguments : Input - in (unsigned word vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word (GP)
+ Details : 4 unsigned word elements of 'in' vector are added together and
+ the resulting integer sum is returned
+*/
+#define HADD_UW_U32(in) \
+ ({ \
+ v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \
+ uint32_t hadd_uw_u32_sum_m; \
+ \
+ hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \
+ hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \
+ hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \
+ hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \
+ hadd_uw_u32_sum_m; \
+ })
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Input - in (unsigned halfword vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of 'in' vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 hadd_uh_u32_res_m; \
+ uint32_t hadd_uh_u32_sum_m; \
+ \
+ hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \
+ hadd_uh_u32_sum_m; \
+ })
+
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+ }
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is subtracted from
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : SAD (Sum of Absolute Difference)
+ Arguments : Inputs - in0, in1, ref0, ref1
+ Outputs - sad_m (halfword vector)
+ Return Type - unsigned halfword
+ Details : Absolute difference of all the byte elements from 'in0' with
+ 'ref0' is calculated and preserved in 'diff0'. Then even-odd
+ pairs are added together to generate 8 halfword results.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+ ({ \
+ v16u8 diff0_m, diff1_m; \
+ v8u16 sad_m = { 0 }; \
+ \
+ diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
+ diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
+ \
+ sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
+ sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
+ \
+ sad_m; \
+ })
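+
+/* Usage sketch (illustrative, not part of the upstream sources; names
+   are hypothetical):
+     v8u16 sad = SAD_UB2_UH(src0, src1, ref0, ref1);
+   accumulates byte absolute differences of two source/reference pairs
+   into eight halfword partial sums. */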
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ even signed halfword element from 'in0' (pairwise) and the
+ word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+ }
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set element n of input vector to GPR value
+ Arguments : Inputs - in0, in1, in2, in3
+ Output - out
+ Return Type - as per RTYPE
+ Details : Set element 0 in vector 'out' to value specified in 'in0'
+*/
+#define INSERT_W2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ }
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
+ }
+#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
+#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+
+#define INSERT_D2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+ }
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ }
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+ }
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+ }
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+ }
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
+ out5, out6, out7) \
+ { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
+ out6, out7); \
+ }
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+ }
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+ }
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right halves of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved and written to 'out0'; the left half is
+ interleaved and written to 'out1'.
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+ }
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val) \
+ }
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ signed value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+ }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+ }
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+ }
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
+ { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+ }
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+ }
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+ }
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' are copied to the left half
+ of 'out0' & even double word elements of 'in1' are copied to
+ the right half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
+ }
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) \
+ { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+ }
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+ }
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+ }
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
+ { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+ }
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is added to the
+ corresponding signed halfword element of 'in1' with full
+ precision, giving one extra bit in the intermediate sum. The
+ sum is then divided by 2 and written to 'out0'.
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+ }
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'in0' are added to signed
+ halfword elements of 'in1'. The result is then saturated to
+ the signed halfword range.
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+ }
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_2V(in0, in1, shift) \
+ { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ }
+
+#define SRA_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+ }
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+ }
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRAR_W2(RTYPE, in0, in1, shift) \
+ SRAR_W2(RTYPE, in2, in3, shift) \
+ }
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+ }
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+ }
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Logical shift right all elements of vector (immediate)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written to 'out0'. 'shift' is an immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
+ { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+ }
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ }
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ }
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ }
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+ }
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved with the same vector 'in' to generate
+ 4 word elements, keeping the sign intact.
+*/
+#define UNPCK_R_SH_SW(in, out) \
+ { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+ }
+
+/* Description : Sign extend byte elements from input vector and return
+ halfword results in pair of vectors
+ Arguments : Input - in (byte vector)
+ Outputs - out0, out1 (sign extended halfword vectors)
+ Return Type - signed halfword
+ Details : Sign bit of byte elements from input vector 'in' is
+ extracted and interleaved right with the same vector 'in' to
+ generate 8 signed halfword elements in 'out0', then
+ interleaved left with 'in' to generate 8 signed halfword
+ elements in 'out1'.
+*/
+#define UNPCK_SB_SH(in, out0, out1) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_b((v16i8)in, 0); \
+ ILVRL_B2_SH(tmp_m, in, out0, out1); \
+ }
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (unsigned halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+ }
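+
+/* Usage sketch (illustrative): for an unsigned byte vector
+ in = { 1, 2, ..., 16 }, UNPCK_UB_SH(in, out0, out1) yields the halfword
+ vectors out0 = { 1, 2, ..., 8 } and out1 = { 9, 10, ..., 16 }, i.e. the
+ low and high halves of 'in' zero extended. */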
+
+/* Description : Sign extend halfword elements from input vector and return
+ the result in pair of vectors
+ Arguments : Input - in (halfword vector)
+ Outputs - out0, out1 (sign extended word vectors)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved right with the same vector 'in' to
+ generate 4 signed word elements in 'out0', then
+ interleaved left with 'in' to generate 4 signed word
+ elements in 'out1'.
+*/
+#define UNPCK_SH_SW(in, out0, out1) \
+ { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+ }
+
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+ }
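+
+/* Worked example (per lane, illustrative): if a given lane of in0..in3
+ holds 1, 2, 3, 4 then the same lane of out0..out3 holds 5, 5, -1, -3:
+ the sums come first, followed by the differences in reverse order. */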
+
+/* Description : Butterfly of 8 input vectors
+ Arguments : Inputs - in0 ... in7
+ Outputs - out0 .. out7
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+ }
+
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, \
+ out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+ }
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+ tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+ }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+ }
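+
+/* Usage sketch (an illustrative reading of the interleaves above): the
+ four least significant halfword lanes of in0..in3 form the 4x4 block;
+ on return the low four lanes of out0..out3 hold columns 0..3 of that
+ block, i.e. the transposed rows. */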
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+ tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+ }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ }
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+ tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ }
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+ }
+
+/* Description : Add block 4x4
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Least significant 4 halfword elements from each input vector
+ are added to the corresponding destination bytes, clipped to
+ the range 0..255 and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
+ { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
+ }
+
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) \
+ ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+ })
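+
+/* Usage sketch (illustrative): with filtered results held in halfword
+ vectors, the even (low) byte of each halfword is packed into a single
+ v16u8 and xor'ed with 128; flipping the sign bit is equivalent to
+ adding 128, which maps the signed byte range back onto 0..255 pixels. */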
+
+/* Description : Convert inputs to unsigned bytes, interleave, average & store
+ as an 8x4 unsigned byte block
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ }
+
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
+ }
+
+/* Description : Horizontal 2 tap filter kernel code
+ Arguments : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
+ ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ \
+ tmp1_m; \
+ })
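+
+/* Usage sketch (illustrative): each output halfword lane is a rounded
+ 2-tap filter result, roughly
+ out[i] = (p0 * c0 + p1 * c1 + (1 << (shift - 1))) >> shift,
+ where the byte pair p0/p1 is selected from 'in0'/'in1' by 'mask' and
+ c0/c1 are the two taps packed in 'coeff'. */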
+#endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c
new file mode 100644
index 0000000000..7f5882bca3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c
@@ -0,0 +1,807 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define SAD_SRC_REF_ABS_SUB_64 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_32 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_16 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_REF_ABS_SUB_4 \
+ "ulw %[tmp0], 0x00(%[src]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_REF_ABS_SUB_4 \
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
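+
+/* Each SAD_SRC_REF_ABS_SUB_N fragment above accumulates one N-byte row of
+ absolute differences into the running total (%[ftmp5] for the 16/32/64
+ byte variants, %[ftmp3] for the 4/8 byte ones). A scalar sketch of one
+ row, for illustration:
+
+ for (x = 0; x < N; ++x) sad += abs(src[x] - ref[x]);
+
+ The SAD_SRC_AVGREF_ABS_SUB_N fragments below do the same after first
+ averaging 'ref' with 'second_pred' (rounding up) via pavgb. */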
+
+#define SAD_SRC_AVGREF_ABS_SUB_64 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_32 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_16 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_AVGREF_ABS_SUB_4 \
+ "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_AVGREF_ABS_SUB_4 \
+ "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \
+ "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
+ "mthc1 $0, %[ftmp1] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+#define sadMxNx4D_mmi(m, n) \
+ void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \
+ }
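+
+/* Illustrative use: sadMxNx4D_mmi(16, 16) expands to vpx_sad16x16x4d_mmi(),
+ which fills sad_array[0..3] with the SAD of 'src' against each of the
+ four reference blocks in ref_array[], by calling the single-reference
+ kernel once per reference. */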
+
+static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad64xN(H) \
+ unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad64x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad64xN(64);
+vpx_sad64xN(32);
+sadMxNx4D_mmi(64, 64);
+sadMxNx4D_mmi(64, 32);
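+
+/* For reference, a scalar sketch of what vpx_sad64x() computes over
+ 'counter' rows of 64 bytes (illustrative only):
+
+ unsigned int sad = 0;
+ int x, y;
+ for (y = 0; y < counter; ++y)
+ for (x = 0; x < 64; ++x)
+ sad += abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
+
+ The _avg_ variants below first replace ref[] with the rounding average
+ (ref[] + second_pred[] + 1) >> 1 and then accumulate the same sum. */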
+
+static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad_avg64xN(H) \
+ unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg64xN(64);
+vpx_sad_avg64xN(32);
+
+static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad32xN(H) \
+ unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad32x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad32xN(64);
+vpx_sad32xN(32);
+vpx_sad32xN(16);
+sadMxNx4D_mmi(32, 64);
+sadMxNx4D_mmi(32, 32);
+sadMxNx4D_mmi(32, 16);
+
+static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad_avg32xN(H) \
+ unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg32xN(64);
+vpx_sad_avg32xN(32);
+vpx_sad_avg32xN(16);
+
+static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad16xN(H) \
+ unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad16x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad16xN(32);
+vpx_sad16xN(16);
+vpx_sad16xN(8);
+sadMxNx4D_mmi(16, 32);
+sadMxNx4D_mmi(16, 16);
+sadMxNx4D_mmi(16, 8);
+
+static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+ mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad_avg16xN(H) \
+ unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg16xN(32);
+vpx_sad_avg16xN(16);
+vpx_sad_avg16xN(8);
+
+static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad8xN(H) \
+ unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad8x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad8xN(16);
+vpx_sad8xN(8);
+vpx_sad8xN(4);
+sadMxNx4D_mmi(8, 16);
+sadMxNx4D_mmi(8, 8);
+sadMxNx4D_mmi(8, 4);
+
+static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad_avg8xN(H) \
+ unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg8xN(16);
+vpx_sad_avg8xN(8);
+vpx_sad_avg8xN(4);
+
+static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ // Process two rows per loop iteration to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad4xN(H) \
+ unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad4x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad4xN(8);
+vpx_sad4xN(4);
+sadMxNx4D_mmi(4, 8);
+sadMxNx4D_mmi(4, 4);
+
+static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ int counter) {
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;
+ mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+    // The loop body is unrolled twice to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ );
+ /* clang-format on */
+
+ return sad;
+}
+
+#define vpx_sad_avg4xN(H) \
+ unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg4xN(8);
+vpx_sad_avg4xN(4);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c
new file mode 100644
index 0000000000..b0f8ff1fd9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
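+// Build one vector from four: element 0 of in0..in3 is inserted into
+// word lanes 0..3 of out.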
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
+ }
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
+
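+// 4xH SAD: four 4-byte rows are loaded as 32-bit words and packed into a
+// single 16-byte vector for both src and ref, so one absolute-difference
+// plus horizontal-add pair covers four rows.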
+static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
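+// 8xH SAD: pairs of 8-byte rows are packed into 16-byte vectors
+// (PCKEV_D4_UB) so each SAD_UB2_UH call covers four rows.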
+static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t sad = 0;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = HADD_UH_U32(sad0);
+ sad += HADD_UH_U32(sad1);
+
+ return sad;
+}
+
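+// x4d variants: compute the SAD of one source block against four reference
+// blocks (aref_ptr[0..3]) in a single pass, reusing each loaded source row.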
+static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ src_ptr += (4 * src_stride);
+
+ LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref0_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref1_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref2_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref3_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref0_ptr += (4 * ref_stride);
+ LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
+ ref1_ptr += (4 * ref_stride);
+ LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
+ ref2_ptr += (4 * ref_stride);
+ LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
+ ref3_ptr += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src, ref0, ref1, ref2, ref3, diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+
+ LD_UB2(ref0_ptr, 16, ref0, ref1);
+ ref0_ptr += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref1_ptr, 16, ref0, ref1);
+ ref1_ptr += ref_stride;
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref2_ptr, 16, ref0, ref1);
+ ref2_ptr += ref_stride;
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref3_ptr, 16, ref0, ref1);
+ ref3_ptr += ref_stride;
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0_0 = { 0 };
+ v8u16 sad0_1 = { 0 };
+ v8u16 sad1_0 = { 0 };
+ v8u16 sad1_1 = { 0 };
+ v8u16 sad2_0 = { 0 };
+ v8u16 sad2_1 = { 0 };
+ v8u16 sad3_0 = { 0 };
+ v8u16 sad3_1 = { 0 };
+ v4u32 sad;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+
+ LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
+ ref0_ptr += ref_stride;
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
+ ref1_ptr += ref_stride;
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
+ ref2_ptr += ref_stride;
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
+ ref3_ptr += ref_stride;
+ sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[0] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[1] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[2] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad3_0, sad3_0);
+ sad += __msa_hadd_u_w(sad3_1, sad3_1);
+ sad_array[3] = HADD_UW_U32(sad);
+}
+
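+// avgsad variants: the reference is first averaged with sec_pred (the
+// compound predictor, stored contiguously at the block width) before the
+// SAD against src is accumulated.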
+static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff, pred, comp;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ comp = __msa_aver_u_b(pred, ref);
+ diff = __msa_asub_u_b(src, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 diff0, diff1, pred0, pred1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
+ sad += SAD_UB2_UH(src0, src1, diff0, diff1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
+ LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
+ ref += (4 * ref_stride);
+
+ LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
+ LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
+ sec_pred += (4 * 32);
+
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
+ sad += SAD_UB2_UH(src4, src5, comp0, comp1);
+ AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
+ sad += SAD_UB2_UH(src6, src7, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 comp0, comp1, comp2, comp3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v4u32 sad;
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+ }
+
+ sad = __msa_hadd_u_w(sad0, sad0);
+ sad += __msa_hadd_u_w(sad1, sad1);
+
+ return HADD_SW_S32(sad);
+}
+
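+// The macros below expand to the public vpx_sadWxH_msa, vpx_sadWxHx4d_msa
+// and vpx_sadWxH_avg_msa entry points for every supported block size.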
+#define VPX_SAD_4xHEIGHT_MSA(height) \
+ uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_8xHEIGHT_MSA(height) \
+ uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_16xHEIGHT_MSA(height) \
+ uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_32xHEIGHT_MSA(height) \
+ uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_64xHEIGHT_MSA(height) \
+ uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
+ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
+ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
+ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
+ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
+ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define VPX_AVGSAD_4xHEIGHT_MSA(height) \
+ uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define VPX_AVGSAD_8xHEIGHT_MSA(height) \
+ uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define VPX_AVGSAD_16xHEIGHT_MSA(height) \
+ uint32_t vpx_sad16x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define VPX_AVGSAD_32xHEIGHT_MSA(height) \
+ uint32_t vpx_sad32x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define VPX_AVGSAD_64xHEIGHT_MSA(height) \
+ uint32_t vpx_sad64x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+// 64x64
+VPX_SAD_64xHEIGHT_MSA(64);
+VPX_SAD_64xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_64xHEIGHT_MSA(64);
+
+// 64x32
+VPX_SAD_64xHEIGHT_MSA(32);
+VPX_SAD_64xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_64xHEIGHT_MSA(32);
+
+// 32x64
+VPX_SAD_32xHEIGHT_MSA(64);
+VPX_SAD_32xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_32xHEIGHT_MSA(64);
+
+// 32x32
+VPX_SAD_32xHEIGHT_MSA(32);
+VPX_SAD_32xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_32xHEIGHT_MSA(32);
+
+// 32x16
+VPX_SAD_32xHEIGHT_MSA(16);
+VPX_SAD_32xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_32xHEIGHT_MSA(16);
+
+// 16x32
+VPX_SAD_16xHEIGHT_MSA(32);
+VPX_SAD_16xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_16xHEIGHT_MSA(32);
+
+// 16x16
+VPX_SAD_16xHEIGHT_MSA(16);
+VPX_SAD_16xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_16xHEIGHT_MSA(16);
+
+// 16x8
+VPX_SAD_16xHEIGHT_MSA(8);
+VPX_SAD_16xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_16xHEIGHT_MSA(8);
+
+// 8x16
+VPX_SAD_8xHEIGHT_MSA(16);
+VPX_SAD_8xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_8xHEIGHT_MSA(16);
+
+// 8x8
+VPX_SAD_8xHEIGHT_MSA(8);
+VPX_SAD_8xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_8xHEIGHT_MSA(8);
+
+// 8x4
+VPX_SAD_8xHEIGHT_MSA(4);
+VPX_SAD_8xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_8xHEIGHT_MSA(4);
+
+// 4x8
+VPX_SAD_4xHEIGHT_MSA(8);
+VPX_SAD_4xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_4xHEIGHT_MSA(8);
+
+// 4x4
+VPX_SAD_4xHEIGHT_MSA(4);
+VPX_SAD_4xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 0000000000..572fcabfc0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,1789 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
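+// Two-tap bilinear filter taps for the eight 1/8-pel offsets; each pair
+// sums to 128 (1 << FILTER_BITS).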
+static const uint8_t bilinear_filters_msa[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
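+// Accumulate per-pixel statistics for one 16-byte vector pair: the sum of
+// squared differences is added to var (via dot products of the signed
+// differences with themselves) and the signed difference sum to sub.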
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ (sub) += res_l0_m + res_l1_m; \
+ }
+
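+// variance = sse - (sum * sum) / (W * H), where shift is log2(W * H). The
+// LARGE variant widens to 64 bits so sum * sum cannot overflow for big
+// blocks.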
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
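+// avg_sse_diff helpers: average src with the compound predictor sec_pred,
+// then return the SSE against ref and write the signed pixel-difference
+// sum to *diff for the variance computation.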
+static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 pred, src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref, pred;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
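+// Horizontal-only sub-pel: each row is filtered with the two-tap bilinear
+// filter (VSHF pairs adjacent pixels, DOTP applies the taps, SRARI rounds
+// by FILTER_BITS) before the SSE/diff accumulation against dst.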
+static uint32_t sub_pixel_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ CALC_MSE_AVG_B(src0, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 filt0, out, ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
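+// Vertical-only sub-pel: adjacent rows are interleaved (ILVR/ILVL) and
+// filtered with the vertical two-tap filter; src0 carries the last row
+// across loop iterations.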
+static uint32_t sub_pixel_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4, out;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 ref = { 0 };
+ v16u8 src2110, src4332;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
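+// Combined sub-pel: rows are first filtered horizontally, and the retained
+// hz_out values are then filtered vertically, implementing the separable
+// bilinear interpolation.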
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out, ref = { 0 };
+ v16u8 filt_vt, filt_hz, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt_vt, filt_hz, vec0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
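+// sub_pixel_avg variants: identical filtering, but the filtered result is
+// averaged with sec_pred before the SSE/diff accumulation (compound
+// prediction).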
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 out, pred, filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
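+
+/* In the _avg_ variants, the sub-pel result is blended with the second
+ * predictor before the SSE/sum accumulation; __msa_aver_u_b is a per-byte
+ * rounding average. Scalar equivalent (illustrative):
+ *
+ *   out[i] = (filtered[i] + sec_pred[i] + 1) >> 1;
+ */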
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 out, pred, filt0;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 pred0, pred1, pred2, pred3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
+ tmp2, tmp3);
+ AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
+ tmp2, tmp3);
+
+ CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+ CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+ CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+ CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 out, pred, ref = { 0 };
+ v16u8 src2110, src4332, filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, filt0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 out, pred, ref = { 0 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 pred0, pred1, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 out0, out1, out2, out3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
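+
+/* The numeric third argument is log2(width * height), so (assuming the macro
+ * bodies defined earlier in this file) both families reduce to the usual
+ * variance decomposition:
+ *
+ *   variance = sse - ((sum * sum) >> log2(w * h))
+ *
+ * with VARIANCE_LARGE_WxH widening sum * sum to 64 bits first so that 32x32
+ * and larger blocks cannot overflow the product. */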
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
+ \
+ if (y_offset) { \
+ if (x_offset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (x_offset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
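+
+/* Usage sketch: x_offset and y_offset are eighth-pel offsets in [0, 7] that
+ * index bilinear_filters_msa; offset 0 selects the identity tap pair, which
+ * is why the zero/zero case can fall through to plain variance. E.g. a
+ * half-pel shift in both directions:
+ *
+ *   uint32_t sse;
+ *   uint32_t var = vpx_sub_pixel_variance16x16_msa(src, src_stride, 4, 4,
+ *                                                  ref, ref_stride, &sse);
+ */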
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
+ \
+ if (y_offset) { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, ht, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+
+uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ int32_t x_offset, int32_t y_offset,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride, uint32_t *sse,
+ const uint8_t *sec_pred) {
+ int32_t diff;
+ const uint8_t *h_filter = bilinear_filters_msa[x_offset];
+ const uint8_t *v_filter = bilinear_filters_msa[y_offset];
+
+ if (y_offset) {
+ if (x_offset) {
+ *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
+ v_filter, 64, &diff);
+ } else {
+ *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ v_filter, 64, &diff);
+ }
+ } else {
+ if (x_offset) {
+ *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ h_filter, 64, &diff);
+ } else {
+ *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+ sec_pred, &diff);
+ }
+ }
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
+ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \
+ \
+ if (y_offset) { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (x_offset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); \
+ }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c
new file mode 100644
index 0000000000..8bd7e6977c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ double ftmp[13];
+ uint32_t tmp[1];
+
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ __asm__ volatile(
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp1] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp2] \n\t"
+#else
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+#else
+ "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp6] \n\t"
+#else
+ "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+#endif
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[src]) \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "ulw %[tmp0], 0x00(%[pred]) \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+#else
+ "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t"
+ "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t"
+ "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t"
+#endif
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t"
+ "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t"
+ "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+#if _MIPS_SIM == _ABIO32
+ [tmp0] "=&r"(tmp[0]),
+#endif
+ [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [src_stride] "r"((mips_reg)src_stride),
+ [pred_stride] "r"((mips_reg)pred_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+ case 8:
+ __asm__ volatile(
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x02 \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ "bnez %[tmp0], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+ [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [pred_stride] "r"((mips_reg)pred_stride),
+ [src_stride] "r"((mips_reg)src_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+ case 16:
+ __asm__ volatile(
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x08 \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t"
+ "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t"
+ "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t"
+ "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t"
+ "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t"
+ "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t"
+ "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t"
+ "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[pred], %[pred], %[pred_stride])
+ "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t"
+ "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t"
+ "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t"
+ "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t"
+ "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t"
+ MMI_ADDU(%[diff], %[diff], %[diff_stride])
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ "bnez %[tmp0], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+ [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+ [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+ [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+ [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+ [pred] "+&r"(pred), [diff] "+&r"(diff)
+ : [pred_stride] "r"((mips_reg)pred_stride),
+ [src_stride] "r"((mips_reg)src_stride),
+ [diff_stride] "r"((mips_reg)(diff_stride * 2))
+ : "memory");
+ break;
+ case 32:
+ vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+ pred, pred_stride);
+ break;
+ case 64:
+ vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+ pred, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+ pred, pred_stride);
+ break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred,
+ pred_stride);
+ }
+}
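+
+/* Reference behavior that every case above must match bit-exactly (this is
+ * what vpx_subtract_block_c computes):
+ *
+ *   for (r = 0; r < rows; ++r)
+ *     for (c = 0; c < cols; ++c)
+ *       diff[r * diff_stride + c] =
+ *           (int16_t)src[r * src_stride + c] - pred[r * pred_stride + c];
+ */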
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c
new file mode 100644
index 0000000000..391a7ebf66
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
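+
+/* Note: ILVRL_B2_UB interleaves pred and src bytes side by side, so the
+ * horizontal subtract in HSUB_UB2_SH (odd lane minus even lane) yields the
+ * signed 16-bit per-pixel difference src - pred, exactly what the scalar
+ * subtract writes out. The wider block helpers below reuse this pairing. */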
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ uint64_t src0, src1, pred0, pred1;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD2(src_ptr, src_stride, src0, src1);
+ src_ptr += (2 * src_stride);
+ LD2(pred_ptr, pred_stride, pred0, pred1);
+ pred_ptr += (2 * pred_stride);
+
+ INSERT_D2_SB(src0, src1, src);
+ INSERT_D2_SB(pred0, pred1, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+ diff_ptr += (2 * diff_stride);
+ }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ int8_t count;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (count = 2; count--;) {
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
+ pred7);
+ pred += (8 * pred_stride);
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_SB2(src, 16, src2, src3);
+ src += src_stride;
+ LD_SB2(src, 16, src4, src5);
+ src += src_stride;
+ LD_SB2(src, 16, src6, src7);
+ src += src_stride;
+
+ LD_SB2(pred, 16, pred0, pred1);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred2, pred3);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred4, pred5);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_SB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+
+ LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+ pred += pred_stride;
+ LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+ }
+}
+
+void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ default:
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c
new file mode 100644
index 0000000000..d4563dc410
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "./macros_msa.h"
+
+uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
+ int size) {
+ int row, col;
+ uint64_t ss_res = 0;
+ v4i32 mul0, mul1;
+ v2i64 res0 = { 0 };
+
+ if (4 == size) {
+ uint64_t src0, src1, src2, src3;
+ v8i16 diff0 = { 0 };
+ v8i16 diff1 = { 0 };
+
+ LD4(src, src_stride, src0, src1, src2, src3);
+ INSERT_D2_SH(src0, src1, diff0);
+ INSERT_D2_SH(src2, src3, diff1);
+ DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
+ mul0 += mul1;
+ res0 = __msa_hadd_s_d(mul0, mul0);
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (8 == size) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1;
+ res0 = __msa_hadd_s_d(mul0, mul0);
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (16 == size) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += 8 * src_stride;
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1;
+ res0 += __msa_hadd_s_d(mul0, mul0);
+
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (0 == (size % 16)) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (row = 0; row < (size >> 4); row++) {
+ for (col = 0; col < size; col += 16) {
+ const int16_t *src_ptr = src + col;
+ LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+ src6, src7);
+ src_ptr += 8 * src_stride;
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+ src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1;
+ res0 += __msa_hadd_s_d(mul0, mul0);
+ }
+
+ src += 16 * src_stride;
+ }
+
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else {
+ int16_t val;
+
+ for (row = 0; row < size; row++) {
+ for (col = 0; col < size; col++) {
+ val = src[col];
+ ss_res += val * val;
+ }
+
+ src += src_stride;
+ }
+ }
+
+ return ss_res;
+}
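+
+/* Quick sanity check (illustrative): a 4x4 block of all 3s must give
+ * 16 * 3 * 3 == 144, matching the scalar fallback in the final else branch:
+ *
+ *   int16_t blk[4 * 4];
+ *   int i;
+ *   for (i = 0; i < 16; ++i) blk[i] = 3;
+ *   assert(vpx_sum_squares_2d_i16_msa(blk, 4, 4) == 144);
+ */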
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h
new file mode 100644
index 0000000000..f27504a207
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
+#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \
+ \
+ k0_m = __msa_fill_h(cnst0); \
+ k1_m = __msa_fill_h(cnst1); \
+ k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \
+ k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \
+ k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \
+ \
+ ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \
+ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
+ DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \
+ s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \
+ s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ \
+ DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ }
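+
+/* DOTP_CONST_PAIR is the rounded rotation used throughout the transforms.
+ * Per 16-bit lane, following the constant interleaving above, it computes:
+ *
+ *   out0 = ROUND_POWER_OF_TWO(reg0 * cnst0 - reg1 * cnst1, DCT_CONST_BITS)
+ *   out1 = ROUND_POWER_OF_TWO(reg1 * cnst0 + reg0 * cnst1, DCT_CONST_BITS)
+ */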
+
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \
+ dst1, dst2, dst3) \
+ { \
+ v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
+ v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
+ \
+ DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \
+ tp4_m); \
+ DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \
+ tp8_m); \
+ BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
+ BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
+ SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
+ SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
+ dst1, dst2, dst3); \
+ }
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \
+ ({ \
+ v8i16 dst_m; \
+ v4i32 tp0_m, tp1_m; \
+ \
+ DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
+ SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
+ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
+ \
+ dst_m; \
+ })
+
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \
+ { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
+ madd0_m, madd1_m, madd2_m, madd3_m); \
+ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
+ }
+
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
+ out2, out3) \
+ { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
+ cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
+ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+ }
+#endif // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c
new file mode 100644
index 0000000000..c2adcfa018
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c
@@ -0,0 +1,1357 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/variance.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
+   vpx_variance32x64; the plain VARIANCE_SSE_SUM_8 would overflow the packed
+   sum accumulators for these block sizes. */
+#define VARIANCE_SSE_SUM_8_FOR_W64 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
+ "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
+ "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
+ "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
+
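The overflow caveat noted above the W64 macro is easy to quantify: with 8-bit pixels the per-pixel difference magnitude is at most 255, so the signed sum over a 64x64 block can reach 64 * 64 * 255 = 1,044,480, which does not fit in 16 bits; this variant therefore accumulates the differences in 32-bit lanes (psubw/paddw into ftmp9). A rough bound check, as an illustration only and not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Worst-case |sum| for a 64x64 block of 8-bit pixels. */
static void sum_bound_check_64x64(void) {
  const int32_t max_abs_sum = 64 * 64 * 255; /* 1,044,480 */
  assert(max_abs_sum > INT16_MAX); /* 16-bit accumulation would overflow */
  assert(max_abs_sum < INT32_MAX); /* 32-bit accumulation is sufficient  */
}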
+#define VARIANCE_SSE_SUM_4 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
+ "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+
+#define VARIANCE_SSE_SUM_8 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
+
+#define VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VARIANCE_SSE_16 \
+ VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
+ /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
+ /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \
+ "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \
+ "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+ \
+ /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
+ "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+ \
+ /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
+ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
+ "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \
+ "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \
+ "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
+ "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the first pass of a 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. The two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride); it is the offset required to move from one input
+// pixel to the next.
+static void var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
+ int pixel_step, unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ ref_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ ref_ptr += output_width;
+ }
+}
+
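The ROUND_POWER_OF_TWO step above is an ordinary 2-tap bilinear interpolation: every row of bilinear_filters sums to 128, which matches FILTER_BITS = 7. A worked example, for illustration only:

/* Half-pel case: bilinear_filters[4] = { 64, 64 }, FILTER_BITS == 7, so
 *   out = (64 * a + 64 * b + 64) >> 7,
 * i.e. the rounded average of the two input pixels.
 * For a = 10, b = 13: (640 + 832 + 64) >> 7 = 1536 >> 7 = 12. */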
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
+// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride); it is the offset required to move from one input to
+// the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ ref_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ ref_ptr += output_width;
+ }
+}
+
+static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
+ [sum]"=&r"(sum)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / (64 * high));
+}
+
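Like the narrower variants below, this function returns the block variance via the one-pass identity variance = SSE - sum^2 / N, with N = 64 * high pixels: the loop accumulates the sum of squared differences into *sse and the signed sum of differences into sum, and the correction is applied once at the end. A plain-C sketch of the same computation, for reference only (assumes <stdint.h> types; not part of the patch):

static uint32_t variance_ref(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride, int w, int h,
                             uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
  }
  *sse = (uint32_t)sse64;
  return (uint32_t)(*sse - ((sum * sum) / (w * h)));
}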
+#define VPX_VARIANCE64XN(n) \
+ uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+VPX_VARIANCE64XN(64)
+VPX_VARIANCE64XN(32)
+
+uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "li %[tmp0], 0x40 \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
+ [sum]"=&r"(sum)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [sse]"r"(sse)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / 2048);
+}
+
+static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / (32 * high));
+}
+
+#define VPX_VARIANCE32XN(n) \
+ uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+VPX_VARIANCE32XN(32)
+VPX_VARIANCE32XN(16)
+
+static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / (16 * high));
+}
+
+#define VPX_VARIANCE16XN(n) \
+ uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+VPX_VARIANCE16XN(32)
+VPX_VARIANCE16XN(16)
+VPX_VARIANCE16XN(8)
+
+static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / (8 * high));
+}
+
+#define VPX_VARIANCE8XN(n) \
+ uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+VPX_VARIANCE8XN(16)
+VPX_VARIANCE8XN(8)
+VPX_VARIANCE8XN(4)
+
+static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
+ VARIANCE_SSE_SUM_4
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
+ "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
+ "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]),
+ [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse - (((int64_t)sum * sum) / (4 * high));
+}
+
+#define VPX_VARIANCE4XN(n) \
+ uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+VPX_VARIANCE4XN(8)
+VPX_VARIANCE4XN(4)
+
+static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_16
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse;
+}
+
+#define vpx_mse16xN(n) \
+ uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
+static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+ /* clang-format on */
+
+ return *sse;
+}
+
+#define vpx_mse8xN(n) \
+ uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+ }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[((H) + 1) * (W)]; \
+ uint8_t temp2[(H) * (W)]; \
+ \
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
+ W, bilinear_filters[x_offset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \
+ }
+
+SUBPIX_VAR(64, 64)
+SUBPIX_VAR(64, 32)
+SUBPIX_VAR(32, 64)
+SUBPIX_VAR(32, 32)
+SUBPIX_VAR(32, 16)
+SUBPIX_VAR(16, 32)
+
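SUBPIX_VAR stitches the two scalar passes together: the first pass filters (H + 1) rows horizontally because the vertical pass needs one extra row of look-ahead, and x_offset / y_offset index bilinear_filters[], i.e. they are eighth-pel offsets in the range 0..7. A minimal caller-side sketch (illustration only; src, ref and the strides are assumed to be set up elsewhere):

uint32_t sse;
/* Variance of a 32x32 block sampled at a half-pel offset in both directions. */
const uint32_t var = vpx_sub_pixel_variance32x32_mmi(src, src_stride, /*x=*/4,
                                                     /*y=*/4, ref, ref_stride,
                                                     &sse);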
+static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
+ int src_stride, int x_offset,
+ int y_offset, uint8_t *temp2,
+ int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ double ff_ph_40, mask;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
+ mips_reg tmp[2];
+ uint64_t x0, x1, y0, y1, all;
+
+ const uint8_t *filter_x = bilinear_filters[x_offset];
+ const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
+ // fdata3: fdata3[0] ~ fdata3[15]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+
+ // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ // temp2: temp2[0] ~ temp2[15]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+ // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ // temp2+16*1: temp2[0] ~ temp2[15]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+
+ "1: \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
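A note on the filter setup shared by this function and the 8- and 4-pixel-wide variants below: the four 8-bit taps are packed into the single word all = x0 | x1 << 8 | y0 << 16 | y1 << 24; the assembly widens those bytes to halfwords with punpcklbh and then broadcasts each tap across a register with pshufh, shifting the vector right between broadcasts. A scalar view of the same unpacking, for illustration only:

const uint64_t all = x0 | (x1 << 8) | (y0 << 16) | (y1 << 24);
const uint16_t fx0 = (uint16_t)(all & 0xff);         /* -> filter_x0 */
const uint16_t fx1 = (uint16_t)((all >> 8) & 0xff);  /* -> filter_x1 */
const uint16_t fy0 = (uint16_t)((all >> 16) & 0xff); /* -> filter_y0 */
const uint16_t fy1 = (uint16_t)((all >> 24) & 0xff); /* -> filter_y1 */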
+#define SUBPIX_VAR16XN(H) \
+ uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint8_t temp2[16 * (H)]; \
+ var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+ ((H)-2) / 2); \
+ \
+ return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \
+ }
+
+SUBPIX_VAR16XN(16)
+SUBPIX_VAR16XN(8)
+
+static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
+ int src_stride, int x_offset,
+ int y_offset, uint8_t *temp2,
+ int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ mips_reg tmp[2];
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
+ const uint8_t *filter_x = bilinear_filters[x_offset];
+ const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
+
+ // fdata3: fdata3[0] ~ fdata3[7]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+
+ // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ // temp2: temp2[0] ~ temp2[7]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ // temp2+8*1: temp2[0] ~ temp2[7]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+
+ "1: \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+#define SUBPIX_VAR8XN(H) \
+ uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint8_t temp2[8 * (H)]; \
+ var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+ ((H)-2) / 2); \
+ \
+ return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \
+ }
+
+SUBPIX_VAR8XN(16)
+SUBPIX_VAR8XN(8)
+SUBPIX_VAR8XN(4)
+
+static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
+ int src_stride, int x_offset,
+ int y_offset, uint8_t *temp2,
+ int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[7];
+ mips_reg tmp[2];
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
+ const uint8_t *filter_x = bilinear_filters[x_offset];
+ const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp6])
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp6])
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
+ // fdata3: fdata3[0] ~ fdata3[3]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+
+ // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+    // temp2: temp2[0] ~ temp2[3]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+    // temp2+4*1: temp2[0] ~ temp2[3]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+
+ "1: \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+ [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+#define SUBPIX_VAR4XN(H) \
+ uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint8_t temp2[4 * (H)]; \
+ var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+ ((H)-2) / 2); \
+ \
+ return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \
+ }
+
+SUBPIX_VAR4XN(8)
+SUBPIX_VAR4XN(4)
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[((H) + 1) * (W)]; \
+ uint8_t temp2[(H) * (W)]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \
+ \
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
+ W, bilinear_filters[x_offset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \
+ }
+
+SUBPIX_AVG_VAR(64, 64)
+SUBPIX_AVG_VAR(64, 32)
+SUBPIX_AVG_VAR(32, 64)
+SUBPIX_AVG_VAR(32, 32)
+SUBPIX_AVG_VAR(32, 16)
+SUBPIX_AVG_VAR(16, 32)
+SUBPIX_AVG_VAR(16, 16)
+SUBPIX_AVG_VAR(16, 8)
+SUBPIX_AVG_VAR(8, 16)
+SUBPIX_AVG_VAR(8, 8)
+SUBPIX_AVG_VAR(8, 4)
+SUBPIX_AVG_VAR(4, 8)
+SUBPIX_AVG_VAR(4, 4)
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c
new file mode 100644
index 0000000000..444b086a6e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define CALC_MSE_B(src, ref, var) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ }
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
+
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
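These two helpers apply the same variance = SSE - sum^2 / N identity as the MMI file, with the division folded into a right shift since every supported block size is a power of two (shift = log2(W * H)); the _LARGE_ variant widens diff * diff to 64 bits for the larger blocks. For illustration only:

/* e.g. a 16x16 block (256 pixels) uses shift 8:
 *   VARIANCE_WxH(sse, diff, 8)        == sse - (uint32_t)(diff * diff) / 256
 * and a 64x64 block (4096 pixels) uses shift 12:
 *   VARIANCE_LARGE_WxH(sse, diff, 12) == sse - (int64_t)diff * diff / 4096 */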
+static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ int32_t ht_cnt;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
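+/* Sum of squares of a 16x16 macroblock of int16 coefficients: 256 values,
+ * read 32 per loop iteration.  The squares are dot-product accumulated into
+ * 64-bit lanes to avoid 32-bit overflow, then folded down to a scalar; the
+ * scalar equivalent is simply sum += src[i] * src[i] over all 256 entries. */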
+static uint32_t get_mb_ss_msa(const int16_t *src) {
+ uint32_t sum, cnt;
+ v8i16 src0, src1, src2, src3;
+ v4i32 src0_l, src1_l, src2_l, src3_l;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v2i64 sq_src_l = { 0 };
+ v2i64 sq_src_r = { 0 };
+
+ for (cnt = 8; cnt--;) {
+ LD_SH4(src, 8, src0, src1, src2, src3);
+ src += 4 * 8;
+
+ UNPCK_SH_SW(src0, src0_l, src0_r);
+ UNPCK_SH_SW(src1, src1_l, src1_r);
+ UNPCK_SH_SW(src2, src2_l, src2_r);
+ UNPCK_SH_SW(src3, src3_l, src3_r);
+
+ DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
+ }
+
+ sq_src_l += __msa_splati_d(sq_src_l, 1);
+ sq_src_r += __msa_splati_d(sq_src_r, 1);
+
+ sum = __msa_copy_s_d(sq_src_l, 0);
+ sum += __msa_copy_s_d(sq_src_r, 0);
+
+ return sum;
+}
+
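+/* The sse_*width_msa helpers below mirror the sse_diff_* loops, but use
+ * CALC_MSE_B to accumulate only the squared differences.  They back the
+ * vpx_mse* entry points, which need no sum-of-differences term. */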
+static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = height >> 1; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16i8 src = { 0 };
+ v16i8 ref = { 0 };
+ v4i32 err0 = { 0 };
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_B(src, ref, err0);
+
+ return HADD_SW_S32(err0);
+}
+
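+/* Variance is derived from the two accumulated quantities as
+ *
+ *   variance = sse - (sum * sum) / (w * h)
+ *
+ * with the division implemented as a right shift by log2(w * h) -- the last
+ * argument of each macro below (e.g. 8 for 16x16 = 256 pixels).  The
+ * VARIANCE_WxH / VARIANCE_LARGE_WxH macros are defined earlier in this file
+ * (outside this hunk); the _LARGE variants presumably widen sum * sum to
+ * 64 bits, which matters from 16x32 upward where the product can exceed
+ * 32 bits. */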
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t vpx_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
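+/* For example, VPX_VARIANCE_WDXHT_MSA(8, 16) expands to
+ * vpx_variance8x16_msa(), which calls
+ * sse_diff_8width_msa(src, src_stride, ref, ref_stride, 16, &diff) and
+ * returns VARIANCE_8Wx16H(*sse, diff). */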
+VPX_VARIANCE_WDXHT_MSA(4, 4)
+VPX_VARIANCE_WDXHT_MSA(4, 8)
+
+VPX_VARIANCE_WDXHT_MSA(8, 4)
+VPX_VARIANCE_WDXHT_MSA(8, 8)
+VPX_VARIANCE_WDXHT_MSA(8, 16)
+
+VPX_VARIANCE_WDXHT_MSA(16, 8)
+VPX_VARIANCE_WDXHT_MSA(16, 16)
+VPX_VARIANCE_WDXHT_MSA(16, 32)
+
+VPX_VARIANCE_WDXHT_MSA(32, 16)
+VPX_VARIANCE_WDXHT_MSA(32, 32)
+
+uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx32H(*sse, diff);
+}
+
+uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
+}
+
+void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
+}
+
+uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
new file mode 100644
index 0000000000..5b5a1cbc3a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -0,0 +1,716 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
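+/* Horizontal sub-pel filtering fused with destination averaging
+ * ("convolve8_avg_horiz"): each output is the rounded average of the
+ * filtered source pixel and the pixel already in dst.  The file provides an
+ * 8-tap path and a 2-tap (bilinear) path; vpx_convolve8_avg_horiz_msa() at
+ * the bottom selects between them per filter. */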
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst0 = { 0 }, res;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, res0, res1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, res0, res1);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ res = PCKEV_XORI128_UB(res0, res1);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+
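+/* A note on the XORI_*_128 / PCKEV_XORI128 pairs used throughout the 8-tap
+ * path: the unsigned 0..255 samples are XORed with 128 to re-bias them to
+ * the signed range -128..127 so that signed dot-product instructions can be
+ * used; the bias is undone when the results are packed back to bytes. */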
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
+ v8i16 filt, vec0, vec1, vec2, vec3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec0, vec1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec2, vec3);
+ SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
+ res3);
+ ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+ XORI_B2_128_UB(res0, res2);
+ AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+ ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt;
+ int64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 };
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ src += (2 * src_stride);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+ dst += dst_stride;
+ }
+}
+
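+/* For widths of 32 and 64, the middle of each row is not loaded directly:
+ * __msa_sldi_b(src2, src0, 8) concatenates two aligned 16-byte loads and
+ * slides by 8 bytes, yielding bytes 8..23 of the row -- the overlapping
+ * window the 8-tap filter needs to produce outputs 8..15. */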
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(dst, 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ for (cnt = 0; cnt < 2; ++cnt) {
+ src0 = LD_SB(&src[cnt << 5]);
+ src2 = LD_SB(&src[16 + (cnt << 5)]);
+ src3 = LD_SB(&src[24 + (cnt << 5)]);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+ vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+ vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+ vec1, vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+ vec1, vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
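+/* 2-tap (bilinear) horizontal path.  The callers pass &filt_hor[3], so the
+ * halfword splatted into filt0 packs the two non-zero taps (elements 3 and
+ * 4 of the 8-tap array), and the unsigned DOTP_UB* dot products apply them
+ * to adjacent byte pairs; the 128-bias trick of the 8-tap path is not
+ * needed here since everything stays unsigned. */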
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0 = { 0 }, vec0, vec1, res;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+ AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+ ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ int64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+
+ for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB2(dst, 16, dst0, dst1);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+ dst += dst_stride;
+ LD_UB2(dst, 16, dst2, dst3);
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src2, src4, src6);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+ PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+ PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+ PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+ dst += dst_stride;
+ }
+}
+
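+/* Dispatcher.  The int16 filter taps are narrowed to int8 (sub-pel
+ * coefficients fit in a byte), and unsupported widths fall back to the C
+ * implementation.  The assert rejects a filter whose taps 2 and 3 read as
+ * 0x800000 when viewed as an int32, i.e. (on a little-endian build) the
+ * copy-only filter {..., 0, 128, ...}, which this kernel is not meant to
+ * see. */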
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ switch (w) {
+ case 4:
+ common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
new file mode 100644
index 0000000000..ba816192a1
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
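+/* 2-D (horizontal then vertical) sub-pel filtering fused with destination
+ * averaging ("convolve8_avg").  The 8-tap kernels filter rows horizontally
+ * into intermediate hz_out vectors and run the vertical 8-tap across those;
+ * blocks wider than 8 simply re-run the 8-wide kernel across columns. */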
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
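+  /* For width 4, each HORIZ_8TAP_FILT call above filters two rows into one
+   * vector; the odd row pairs (hz_out1, hz_out3) are synthesized by sliding
+   * across adjacent results instead of being re-filtered. */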
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ res = PCKEV_XORI128_UB(res0, res1);
+ res = (v16u8)__msa_aver_u_b(res, dst0);
+ ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ vec0 = vec2;
+ vec1 = vec3;
+ vec2 = vec4;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
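+/* 2-tap x 2-tap (bilinear) 2-D path: one horizontal bilinear pass into the
+ * hz_out vectors, one vertical bilinear pass across consecutive hz_out
+ * rows, each rounded by FILTER_BITS, then averaged with dst. */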
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 dst0 = { 0 }, out;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, dst0);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
+ tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+ ST4x8_UB(res0, res1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else {
+ common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+ src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
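+/* Dispatcher for the 2-D case.  The MSA kernels cover the all-2-tap and
+ * all-8-tap combinations; a mixed 2-tap/8-tap filter pair falls back to
+ * the C implementation, as do unsupported widths. */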
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
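
For reference, and not part of the imported patch: a plain-C sketch of what the 2-tap horizontal + 2-tap vertical averaging kernels above compute for each output pixel. The names below (avg_2tap_hv_ref, the locally defined ROUND_POWER_OF_TWO, clip_pixel) are illustrative stand-ins for the equivalent libvpx helpers; the MSA code vectorizes this over 4-, 8- and 16-pixel rows.

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* Bilinear horizontal pass, bilinear vertical pass, then a rounded average
 * with the pixel already in dst (the "_and_aver_dst_" part). */
static void avg_2tap_hv_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, const int16_t *f_hz,
                            const int16_t *f_vt, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const uint8_t *s = &src[y * src_stride + x];
      const int t0 = ROUND_POWER_OF_TWO(s[0] * f_hz[0] + s[1] * f_hz[1],
                                        FILTER_BITS);
      const int t1 = ROUND_POWER_OF_TWO(
          s[src_stride] * f_hz[0] + s[src_stride + 1] * f_hz[1], FILTER_BITS);
      const uint8_t res = clip_pixel(
          ROUND_POWER_OF_TWO(t0 * f_vt[0] + t1 * f_vt[1], FILTER_BITS));
      dst[y * dst_stride + x] =
          (uint8_t)ROUND_POWER_OF_TWO(dst[y * dst_stride + x] + res, 1);
    }
  }
}
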
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
new file mode 100644
index 0000000000..e6a790dfc6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0 = { 0 }, out;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
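
For orientation, again illustrative rather than part of the patch: the 8-tap vertical averaging kernels above preload seven rows, then per loop iteration read four new rows, form the byte-interleaved pairs consumed by FILT_8TAP_DPADD_S_H, round with SRARI by FILTER_BITS, saturate, and average with the destination. One output pixel of that pipeline in scalar form, reusing the ROUND_POWER_OF_TWO/clip_pixel helpers from the sketch earlier in this section:

/* src points at the row three lines above the output row, as arranged by the
 * "src -= (3 * src_stride)" adjustment in the kernels above. */
static uint8_t vt_8tap_avg_pixel(const uint8_t *src, int src_stride,
                                 uint8_t dst_pixel, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k * src_stride] * filter[k];
  {
    const uint8_t res = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    /* rounded average with the pixel already present in dst */
    return (uint8_t)ROUND_POWER_OF_TWO(dst_pixel + res, 1);
  }
}
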
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
+ filt2, filt3);
+ out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
+ filt2, filt3);
+ out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
+ filt2, filt3);
+ out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
+ dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height, int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+
+ LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
+ dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 dst0 = { 0 }, out, filt0, src2110, src4332;
+ v16i8 src10_r, src32_r, src21_r, src43_r;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ src4 = LD_SB(src);
+ src += src_stride;
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16u8 dst0 = { 0 }, dst1 = { 0 };
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ v16u8 src2110, src4332, src6554, src8776, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+ ST4x8_UB(src2110, src4332, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
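
The __msa_aver_u_b / PCKEV_AVG_ST_* steps used throughout these averaging kernels compute a per-byte rounded average. Assuming the MSA "average unsigned byte" operation rounds upward, as the C reference path does, each stored byte amounts to:

/* predicted = filtered result, existing = byte already in dst */
static uint8_t rounded_avg(uint8_t predicted, uint8_t existing) {
  return (uint8_t)((predicted + existing + 1) >> 1);
}
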
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ int64_t tp0, tp1, tp2, tp3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int64_t tp0, tp1, tp2, tp3;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+ LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst2);
+ INSERT_D2_UB(tp2, tp3, dst3);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB2(src, 16, src0, src5);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5;
+ v16u8 src6, src7, src8, src9, src10, src11, filt0;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(dst + 16, dst_stride, dst2, dst3);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(dst + 32, dst_stride, dst4, dst5);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ LD_UB2(dst + 48, dst_stride, dst6, dst7);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+        break;
+ case 32:
+ common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
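
Note on the &filt_ver[3] arguments above: in the 8-tap InterpKernel layout, a bilinear ("2-tap") kernel keeps its only non-zero coefficients at positions 3 and 4, so handing the 2-tap helpers a pointer to element 3 gives them exactly the two active taps. An illustrative (not imported) half-pel example:

/* Illustration only: a half-pel bilinear kernel in the 8-tap layout. */
static const int16_t kBilinearHalfPel[8] = { 0, 0, 0, 64, 64, 0, 0, 0 };
/* &kBilinearHalfPel[3] -> { 64, 64 }: the two taps the *_2t_* kernels read. */
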
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
new file mode 100644
index 0000000000..792c0f709c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ SRARI_H2_SH(out0, out1, FILTER_BITS);
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
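
In the 8-tap horizontal path above, the XORI_*_128 steps move pixels into the signed domain so the signed dot-product and saturation operations can be used, and PCKEV_XORI128_UB removes that bias again when packing the result. Stripped of the vectorization, one output pixel is (a sketch, reusing the helpers defined earlier in this section):

/* src has already been moved back by 3 ("src -= 3" in the kernels above) so
 * the 8 taps straddle the output position. */
static uint8_t hz_8tap_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  return clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}
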
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (2 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+
+ src0 = LD_SB(src + 32);
+ src2 = LD_SB(src + 48);
+ src3 = LD_SB(src + 56);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst + 32);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16i8 res0, res1, res2, res3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+ ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask, out0, out1;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ PCKEV_ST_SB(out6, out7, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src4 = LD_SB(src + 32);
+ src6 = LD_SB(src + 48);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ PCKEV_ST_SB(out4, out5, dst + 32);
+ PCKEV_ST_SB(out6, out7, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_x) == 2) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
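
The wrapper above, like the averaging wrappers earlier, keys its dispatch on vpx_get_filter_taps(): kernels whose outer coefficients are all zero take the bilinear path, everything else takes the 8-tap path, and unsupported widths fall back to vpx_convolve8_horiz_c. A hedged stand-in for that 2-versus-8 decision (the real helper lives in vpx_dsp/vpx_filter.h and may distinguish further tap counts):

/* Illustrative only: true when a kernel in the 8-tap layout is effectively
 * bilinear, i.e. only positions 3 and 4 can be non-zero. */
static int is_bilinear_kernel(const int16_t *k) {
  return (k[0] | k[1] | k[2] | k[5] | k[6] | k[7]) == 0;
}
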
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
new file mode 100644
index 0000000000..cb7bca5589
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
@@ -0,0 +1,716 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx_ports/mem.h"
+
+#define GET_DATA_H_MMI \
+ "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \
+ "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \
+ "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \
+ "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \
+ "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
+ "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
+ "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \
+ "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \
+ "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \
+ "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
+ "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
+ "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t"
+
+#define GET_DATA_V_MMI \
+ "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \
+ "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \
+ "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
+ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
+ "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
+ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
+ "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
+ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
+ "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \
+ "pmaddhw %[srch], %[srch], %[filter10] \n\t" \
+ "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
+ "paddw %[srch], %[srch], %[ftmp12] \n\t" \
+ "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
+ "paddw %[srch], %[srch], %[ftmp12] \n\t" \
+ "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
+ "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
+ "paddw %[srch], %[srch], %[ftmp12] \n\t"
+
+/* clang-format off */
+#define ROUND_POWER_OF_TWO_MMI \
+  /* Add the rounding constant para[0] = 1 << (FILTER_BITS - 1) */       \
+ "lw %[tmp0], 0x00(%[para]) \n\t" \
+ MMI_MTC1(%[tmp0], %[ftmp6]) \
+ "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \
+ "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \
+ "paddw %[srch], %[srch], %[ftmp6] \n\t" \
+  /* Arithmetic right shift by para[1] = FILTER_BITS bits */             \
+ "lw %[tmp0], 0x04(%[para]) \n\t" \
+ MMI_MTC1(%[tmp0], %[ftmp5]) \
+ "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \
+ "psraw %[srch], %[srch], %[ftmp5] \n\t"
+/* clang-format on */
+
+#define CLIP_PIXEL_MMI \
+  /* Saturating pack of the filtered sums to unsigned 8-bit pixels */    \
+ "packsswh %[srcl], %[srcl], %[srch] \n\t" \
+ "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t"
+
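
Taken together, ROUND_POWER_OF_TWO_MMI and CLIP_PIXEL_MMI post-process each 32-bit filtered sum the same way the C reference does: add the rounding constant, arithmetic-shift right by FILTER_BITS, then saturate to an unsigned 8-bit pixel (packsswh clamps to signed 16-bit, packushb then clamps to [0, 255]). In plain C, as a sketch rather than part of the patch:

static uint8_t round_shift_clip(int32_t sum) {
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* ROUND_POWER_OF_TWO_MMI */
  if (sum < 0) sum = 0;      /* CLIP_PIXEL_MMI: packsswh then packushb */
  if (sum > 255) sum = 255;  /* net effect: saturate to [0, 255] */
  return (uint8_t)sum;
}
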
+static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int32_t w, int32_t h) {
+ const int16_t *filter_x = filter[x0_q4];
+ double ftmp[14];
+ uint32_t tmp[2];
+ uint32_t para[5];
+ para[0] = (1 << ((FILTER_BITS)-1));
+ para[1] = FILTER_BITS;
+ src -= SUBPEL_TAPS / 2 - 1;
+ src_stride -= w;
+ dst_stride -= w;
+ (void)x_step_q4;
+
+ /* clang-format off */
+ __asm__ volatile(
+ "move %[tmp1], %[width] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
+ "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
+ "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
+ "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
+ "1: \n\t"
+      /* Load four overlapping 8-pixel windows from the current row */
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
+ "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
+ "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
+ "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
+ "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+      /* Apply the 8-tap horizontal filter to the four windows */
+ GET_DATA_H_MMI
+ ROUND_POWER_OF_TWO_MMI
+ CLIP_PIXEL_MMI
+ "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+      /* Loop control: first over the remaining width, then over the rows */
+ "bnez %[width], 1b \n\t"
+ "move %[width], %[tmp1] \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
+ [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
+ [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
+ [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
+ [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
+ [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
+ [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [src]"+&r"(src), [width]"+&r"(w),
+ [dst]"+&r"(dst), [height]"+&r"(h)
+ : [filter]"r"(filter_x), [para]"r"(para),
+ [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
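
The inline assembly in convolve_horiz_mmi follows the traversal sketched below: both strides are pre-reduced by w, so after the inner loop has stepped through a row four pixels at a time, adding the reduced stride lands src and dst at the start of the next row (tmp1 restores the width counter). The sketch assumes w is a multiple of 4, as the MMI code does, and a plain byte copy stands in for the filter/round/clip work:

static void traverse_like_mmi(const uint8_t *src, int reduced_src_stride,
                              uint8_t *dst, int reduced_dst_stride, int w,
                              int h) {
  while (h--) {
    int x;
    for (x = w; x != 0; x -= 4) {
      /* the real kernel filters, rounds and clips these 4 pixels */
      dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
      src += 4;
      dst += 4;
    }
    src += reduced_src_stride; /* == original_stride - w */
    dst += reduced_dst_stride; /* == original_stride - w */
  }
}
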
+
+static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int y0_q4,
+ int y_step_q4, int32_t w, int32_t h) {
+ const int16_t *filter_y = filter[y0_q4];
+ double ftmp[16];
+ uint32_t tmp[1];
+ uint32_t para[2];
+ ptrdiff_t addr = src_stride;
+ para[0] = (1 << ((FILTER_BITS)-1));
+ para[1] = FILTER_BITS;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ src_stride -= w;
+ dst_stride -= w;
+ (void)y_step_q4;
+
+ __asm__ volatile(
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
+ "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
+ "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
+ "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
+ "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+      /* Load 8 bytes from each of the 8 rows feeding these outputs */
+ "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
+ MMI_ADDU(%[tmp0], %[src], %[addr])
+ "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ /* Multiply-accumulate with the filter taps */
+ GET_DATA_V_MMI
+ ROUND_POWER_OF_TWO_MMI
+ CLIP_PIXEL_MMI
+ "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ /* Loop over the remaining columns */
+ "bnez %[width], 1b \n\t"
+ MMI_SUBU(%[width], %[addr], %[src_stride])
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
+ [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+ [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+ [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
+ [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
+ [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
+ [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
+ [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
+ [src]"+&r"(src), [dst]"+&r"(dst),
+ [width]"+&r"(w), [height]"+&r"(h),
+ [tmp0]"=&r"(tmp[0])
+ : [filter]"r"(filter_y), [para]"r"(para),
+ [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride),
+ [addr]"r"((mips_reg)addr)
+ : "memory"
+ );
+}
+
+static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int32_t w, int32_t h) {
+ const int16_t *filter_x = filter[x0_q4];
+ double ftmp[14];
+ uint32_t tmp[2];
+ uint32_t para[2];
+ para[0] = (1 << ((FILTER_BITS)-1));
+ para[1] = FILTER_BITS;
+ src -= SUBPEL_TAPS / 2 - 1;
+ src_stride -= w;
+ dst_stride -= w;
+ (void)x_step_q4;
+
+ __asm__ volatile(
+ "move %[tmp1], %[width] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
+ "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
+ "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
+ "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
+ "1: \n\t"
+ /* Load four overlapping 8-byte runs from the current row */
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
+ "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
+ "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
+ "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
+ "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ /* Multiply-accumulate with the filter taps */
+ GET_DATA_H_MMI
+ ROUND_POWER_OF_TWO_MMI
+ CLIP_PIXEL_MMI
+ "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
+ "li %[tmp0], 0x10001 \n\t"
+ MMI_MTC1(%[tmp0], %[ftmp5])
+ "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ /* Loop over the remaining columns */
+ "bnez %[width], 1b \n\t"
+ "move %[width], %[tmp1] \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
+ [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
+ [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
+ [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
+ [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
+ [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
+ [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [src]"+&r"(src), [width]"+&r"(w),
+ [dst]"+&r"(dst), [height]"+&r"(h)
+ : [filter]"r"(filter_x), [para]"r"(para),
+ [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+}
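+
+/* For reference: the 0x10001 constant above, replicated by punpcklhw into
+ * four halfwords of 0x0001, serves both as the +1 rounding term for paddh
+ * and as the per-lane shift amount for psrah, so each halfword becomes a
+ * rounded average. A minimal scalar sketch of the idiom (the helper name is
+ * illustrative, not part of libvpx):
+ */
+static uint8_t avg_round_sketch(uint8_t filtered, uint8_t dst) {
+  /* Equivalent to ROUND_POWER_OF_TWO(filtered + dst, 1). */
+  return (uint8_t)((filtered + dst + 1) >> 1);
+}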
+
+static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int y0_q4,
+ int y_step_q4, int32_t w, int32_t h) {
+ const int16_t *filter_y = filter[y0_q4];
+ double ftmp[16];
+ uint32_t tmp[1];
+ uint32_t para[2];
+ ptrdiff_t addr = src_stride;
+ para[0] = (1 << ((FILTER_BITS)-1));
+ para[1] = FILTER_BITS;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ src_stride -= w;
+ dst_stride -= w;
+ (void)y_step_q4;
+
+ __asm__ volatile(
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
+ "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
+ "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
+ "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
+ "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
+ "1: \n\t"
+ /* Load 8 bytes from each of the 8 tap rows */
+ "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
+ MMI_ADDU(%[tmp0], %[src], %[addr])
+ "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
+ MMI_ADDU(%[tmp0], %[tmp0], %[addr])
+ "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ /* Multiply-accumulate with the filter taps */
+ GET_DATA_V_MMI
+ ROUND_POWER_OF_TWO_MMI
+ CLIP_PIXEL_MMI
+ "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
+ "li %[tmp0], 0x10001 \n\t"
+ MMI_MTC1(%[tmp0], %[ftmp5])
+ "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ /* Loop over the remaining columns */
+ "bnez %[width], 1b \n\t"
+ MMI_SUBU(%[width], %[addr], %[src_stride])
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
+ [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+ [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+ [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
+ [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
+ [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
+ [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
+ [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
+ [src]"+&r"(src), [dst]"+&r"(dst),
+ [width]"+&r"(w), [height]"+&r"(h),
+ [tmp0]"=&r"(tmp[0])
+ : [filter]"r"(filter_y), [para]"r"(para),
+ [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride),
+ [addr]"r"((mips_reg)addr)
+ : "memory"
+ );
+}
+
+void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (w & 0x03) {
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ double ftmp[4];
+ uint32_t tmp[2];
+ src_stride -= w;
+ dst_stride -= w;
+
+ __asm__ volatile(
+ "move %[tmp1], %[width] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x10001 \n\t"
+ MMI_MTC1(%[tmp0], %[ftmp3])
+ "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t"
+ "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "swc1 %[ftmp1], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ "bnez %[width], 1b \n\t"
+ "move %[width], %[tmp1] \n\t"
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [src]"+&r"(src), [dst]"+&r"(dst),
+ [width]"+&r"(w), [height]"+&r"(h)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ }
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
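+
+/* FILTER_BITS is 7 (vpx_dsp/vpx_filter.h) and every InterpKernel sums to
+ * 128 == 1 << FILTER_BITS, so the rounded shift above maps a flat source
+ * region onto itself. A minimal scalar sketch of the rounding step (the
+ * helper name is illustrative, not part of libvpx):
+ */
+static uint8_t round_filter_sum_sketch(int sum) {
+  /* ROUND_POWER_OF_TWO(sum, FILTER_BITS) == (sum + 64) >> 7 */
+  return clip_pixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
+}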
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+ 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4,
+ int32_t y_step_q4, int32_t w, int32_t h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
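+ //   (Worked through: 63 * 32 = 2016; 2016 + 15 = 2031; 2031 / 16 rounded
+ //   up is 127 rows; 127 + 8 filter-tap rows = 135.)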
+ // When this is called from the frame scaling function, the smallest scaling
+ // factor is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that
+ // case, the temp buffer is still big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w & 0x03) {
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
+ 64, filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ filter, y0_q4, y_step_q4, w, h);
+ } else {
+ convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ filter, y0_q4, y_step_q4, w, h);
+ }
+}
+
+void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+ if (w & 0x03)
+ convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ w, h);
+ else
+ convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+ if (w & 0x03)
+ convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+ h);
+ else
+ convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+ if (w & 0x03)
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h);
+ else
+ convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+ if (w & 0x03)
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
+ else
+ convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
+ vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
new file mode 100644
index 0000000000..c942167587
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -0,0 +1,1227 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
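+
+/* Each 16-byte row above is a vshf.b control vector: mask0 gathers the source
+ * byte pairs (i, i + 1) that taps 0 and 1 multiply at every output position
+ * i, and the filter code below derives mask1..mask3 as mask0 + 2/4/6 for the
+ * remaining tap pairs. A scalar view of one pair's contribution (the helper
+ * name is illustrative; the SIMD path additionally biases the bytes by 128
+ * via XORI_B*_128 so it can use signed multiplies):
+ */
+static int tap_pair_sketch(const uint8_t *s, const int8_t *f, int k) {
+  /* Pair k covers taps 2k and 2k + 1 of the 8-tap kernel. */
+  return s[2 * k] * f[2 * k] + s[2 * k + 1] * f[2 * k + 1];
+}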
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ out0 = out2;
+ out1 = out3;
+ out2 = out4;
+ }
+}
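+
+/* The tail assignments above (hz_out5 = hz_out9, out0 = out2, ...) slide an
+ * eight-row window down by the four rows consumed per iteration, so each pass
+ * computes only four new horizontal results. A scalar sketch of the same
+ * rolling-window vertical pass (names illustrative; rows must hold h + 7
+ * horizontally filtered values for one column):
+ */
+static void rolling_vert_sketch(const int16_t *rows, int h, int16_t *out,
+                                const int8_t *taps) {
+  int16_t win[8];
+  int r, k;
+  for (k = 0; k < 7; ++k) win[k] = rows[k];
+  for (r = 0; r < h; ++r) {
+    int sum = 0;
+    win[7] = rows[r + 7]; /* only one new row is fetched per output */
+    for (k = 0; k < 8; ++k) sum += win[k] * taps[k];
+    out[r] = (int16_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
+    for (k = 0; k < 7; ++k) win[k] = win[k + 1]; /* slide the window */
+  }
+}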
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src7, src8, src9, src10);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(vec0, vec1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16i8 res0, res1, res2, res3;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
+ vec5, vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+ PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+ PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int32_t x_step_q4, int y0_q4,
+ int32_t y_step_q4, int32_t w, int32_t h) {
+ const int16_t *const filter_x = filter[x0_q4];
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, &filt_hor[3],
+ &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filt_hor, filt_ver,
+ (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
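+
+/* Bilinear kernels keep only taps 3 and 4 non-zero, which is why the 2-tap
+ * paths above are handed &filt_hor[3] and &filt_ver[3]. A scalar sketch of
+ * that 2-tap case (the helper name is illustrative, not part of libvpx):
+ */
+static uint8_t bilinear_2tap_sketch(const uint8_t *s, const int8_t *filt) {
+  /* filt points at taps 3 and 4 of the 8-tap kernel; the other six are 0. */
+  const int sum = s[0] * filt[0] + s[1] * filt[1];
+  const int v = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}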
+
+static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ uint32_t res;
+ v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+ v16i8 out0, out1;
+ v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
+ v16i8 shf2 = shf1 + 2;
+ v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+ v16i8 filt_shf1 = filt_shf0 + 2;
+ v16i8 filt_shf2 = filt_shf0 + 4;
+ v16i8 filt_shf3 = filt_shf0 + 6;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+ XORI_B2_128_SB(out0, out1);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+
+ filt = LD_SH(x_filter);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+ src0_h *= filt0;
+ src0_h += src1_h * filt1;
+ src0_h += src2_h * filt2;
+ src0_h += src3_h * filt3;
+
+ src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+ src0_h = __msa_adds_s_h(src0_h, src1_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ res = __msa_copy_u_w((v4i32)dst0, 0);
+ SW(res, dst);
+}
+
+static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+ v16i8 out0, out1, out2, out3;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src2);
+ INSERT_D2_UB(srcd2, srcd3, src3);
+
+ filt = LD_SH(x_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ // transpose
+ VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+
+ XORI_B4_128_SB(out0, out1, out2, out3);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+ UNPCK_SB_SH(out2, src4_h, src5_h);
+ UNPCK_SB_SH(out3, src6_h, src7_h);
+
+ src0_h *= filt0;
+ src4_h *= filt4;
+ src0_h += src1_h * filt1;
+ src4_h += src5_h * filt5;
+ src0_h += src2_h * filt2;
+ src4_h += src6_h * filt6;
+ src0_h += src3_h * filt3;
+ src4_h += src7_h * filt7;
+
+ src0_h = __msa_adds_s_h(src0_h, src4_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ ST8x1_UB(dst0, dst);
+}
+
+static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *x_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
+ v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+ v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+ v8i16 dst0_h, dst1_h, dst2_h, dst3_h;
+
+ LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src0);
+ INSERT_D2_UB(srcd2, srcd3, src1);
+ LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src2);
+ INSERT_D2_UB(srcd2, srcd3, src3);
+ LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src4);
+ INSERT_D2_UB(srcd2, srcd3, src5);
+ LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_UB(srcd0, srcd1, src6);
+ INSERT_D2_UB(srcd2, srcd3, src7);
+
+ filt = LD_SH(x_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ // transpose
+ VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+ XORI_B4_128_SB(out0, out1, out2, out3);
+
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+ UNPCK_SB_SH(out2, src4_h, src5_h);
+ UNPCK_SB_SH(out3, src6_h, src7_h);
+
+ VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_SB(tmp2, tmp0, out4, out5);
+ ILVRL_W2_SB(tmp3, tmp1, out6, out7);
+ XORI_B4_128_SB(out4, out5, out6, out7);
+
+ dst0_h = src0_h * filt0;
+ dst1_h = src4_h * filt4;
+ dst0_h += src1_h * filt1;
+ dst1_h += src5_h * filt5;
+ dst0_h += src2_h * filt2;
+ dst1_h += src6_h * filt6;
+ dst0_h += src3_h * filt3;
+ dst1_h += src7_h * filt7;
+
+ UNPCK_SB_SH(out4, src0_h, src1_h);
+ UNPCK_SB_SH(out5, src2_h, src3_h);
+ UNPCK_SB_SH(out6, src4_h, src5_h);
+ UNPCK_SB_SH(out7, src6_h, src7_h);
+
+ dst2_h = src0_h * filt0;
+ dst3_h = src4_h * filt4;
+ dst2_h += src1_h * filt1;
+ dst3_h += src5_h * filt5;
+ dst2_h += src2_h * filt2;
+ dst3_h += src6_h * filt6;
+ dst2_h += src3_h * filt3;
+ dst3_h += src7_h * filt7;
+
+ ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
+ SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
+ SAT_SH2_SH(dst0_h, dst2_h, 7);
+ dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
+ ST_UB(dst0, dst);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0;
+ v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+ in0 = LD_UB(src);
+ out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
+ ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
+ v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+ v16i8 shf2 = shf1 + 4;
+
+ LD_UB4(src, 16, in0, in1, in2, in3);
+ VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
+ VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
+ ILVRL_W2_UB(tmp2, tmp0, out0, out1);
+ ILVRL_W2_UB(tmp3, tmp1, out2, out3);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
+ ptrdiff_t dst_stride) {
+ v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
+ v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
+ v16u8 out9, out10, out11, out12, out13, out14, out15;
+
+ LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, out0, out1, out2, out3,
+ out4, out5, out6, out7);
+ ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
+ dst += 8 * dst_stride;
+
+ SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
+ SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
+ SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
+ SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
+
+ TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, out8, out9, out10, out11,
+ out12, out13, out14, out15);
+ ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int y, z, i;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose4x4_to_dst(temp, dst, dst_stride);
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
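+
+/* When (x_q4 & SUBPEL_MASK) == 0 above, the kernel is the identity: its only
+ * non-zero tap is 128, at index 3, which is the centre position once src has
+ * been rewound by SUBPEL_TAPS / 2 - 1 == 3. Filtering therefore degenerates
+ * to the plain copy in the else branch, since
+ *
+ *   (src_x[3] * 128) >> FILTER_BITS == src_x[3]
+ */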
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int y, z, i;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
+
+ do {
+ int x_q4 = x0_q4;
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[3 + i * src_stride];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose8x8_to_dst(temp, dst, dst_stride);
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
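+
+/* Layout note: filter_horiz_w8_msa computes the 8 outputs for a single x
+ * position (one dst column) from 8 consecutive source rows and stores them
+ * contiguously at temp + z * 8, so the horizontal filter effectively runs as
+ * a vertical pass over transposed data; transpose8x8_to_dst then restores
+ * raster order. The index identity, for 0 <= z, i < 8:
+ *
+ *   dst[i * dst_stride + z] == temp[z * 8 + i]
+ */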
+
+static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
+ int x, y, z, i;
+
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 16x16 areas. The intermediate height is not always
+ // a multiple of 16, so force it to be a multiple of 16 here.
+ y = h + (16 - (h & 0xF));
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 16) {
+ for (z = 0; z < 16; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
+ } else {
+ for (i = 0; i < 16; ++i) {
+ temp[z * 16 + i] = src_x[3 + i * src_stride];
+ }
+ }
+
+ x_q4 += x_step_q4;
+ }
+
+ transpose16x16_to_dst(temp, dst + x, dst_stride);
+ }
+
+ src += src_stride * 16;
+ dst += dst_stride * 16;
+ } while (y -= 16);
+}
+
+static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter) {
+ uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
+ uint32_t res;
+ v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+ v16i8 out0, out1;
+ v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
+ v16i8 shf2 = shf1 + 8;
+ v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+ v16i8 filt_shf1 = filt_shf0 + 2;
+ v16i8 filt_shf2 = filt_shf0 + 4;
+ v16i8 filt_shf3 = filt_shf0 + 6;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h;
+ v8i16 filt0, filt1, filt2, filt3;
+
+ LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
+ LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
+ INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
+ INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
+ VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+ XORI_B2_128_SB(out0, out1);
+ UNPCK_SB_SH(out0, src0_h, src1_h);
+ UNPCK_SB_SH(out1, src2_h, src3_h);
+
+ filt = LD_SH(y_filter);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+ VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+ src0_h *= filt0;
+ src0_h += src1_h * filt1;
+ src0_h += src2_h * filt2;
+ src0_h += src3_h * filt3;
+
+ src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+ src0_h = __msa_adds_s_h(src0_h, src1_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ res = __msa_copy_u_w((v4i32)dst0, 0);
+ SW(res, dst);
+}
+
+static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter) {
+ uint64_t srcd0, srcd1, srcd2, srcd3;
+ v16u8 dst0;
+ v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_SB(srcd0, srcd1, src0);
+ INSERT_D2_SB(srcd2, srcd3, src1);
+ LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+ INSERT_D2_SB(srcd0, srcd1, src2);
+ INSERT_D2_SB(srcd2, srcd3, src3);
+
+ filt = LD_SH(y_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ UNPCK_SB_SH(src0, src0_h, src1_h);
+ UNPCK_SB_SH(src1, src2_h, src3_h);
+ UNPCK_SB_SH(src2, src4_h, src5_h);
+ UNPCK_SB_SH(src3, src6_h, src7_h);
+
+ src0_h *= filt0;
+ src4_h *= filt4;
+ src0_h += src1_h * filt1;
+ src4_h += src5_h * filt5;
+ src0_h += src2_h * filt2;
+ src4_h += src6_h * filt6;
+ src0_h += src3_h * filt3;
+ src4_h += src7_h * filt7;
+
+ src0_h = __msa_adds_s_h(src0_h, src4_h);
+ src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+ src0_h = __msa_sat_s_h(src0_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+ ST8x1_UB(dst0, dst);
+}
+
+static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+ uint8_t *dst, const int16_t *y_filter,
+ int w) {
+ int x;
+ v16u8 dst0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+ v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
+ v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+ filt = LD_SH(y_filter);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+ SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+ for (x = 0; x < w; x += 16) {
+ LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
+ src_y += 16;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ XORI_B4_128_SB(src4, src5, src6, src7);
+ UNPCK_SB_SH(src0, src0_h, src1_h);
+ UNPCK_SB_SH(src1, src2_h, src3_h);
+ UNPCK_SB_SH(src2, src4_h, src5_h);
+ UNPCK_SB_SH(src3, src6_h, src7_h);
+ UNPCK_SB_SH(src4, src8_h, src9_h);
+ UNPCK_SB_SH(src5, src10_h, src11_h);
+ UNPCK_SB_SH(src6, src12_h, src13_h);
+ UNPCK_SB_SH(src7, src14_h, src15_h);
+
+ src0_h *= filt0;
+ src1_h *= filt0;
+ src8_h *= filt4;
+ src9_h *= filt4;
+ src0_h += src2_h * filt1;
+ src1_h += src3_h * filt1;
+ src8_h += src10_h * filt5;
+ src9_h += src11_h * filt5;
+ src0_h += src4_h * filt2;
+ src1_h += src5_h * filt2;
+ src8_h += src12_h * filt6;
+ src9_h += src13_h * filt6;
+ src0_h += src6_h * filt3;
+ src1_h += src7_h * filt3;
+ src8_h += src14_h * filt7;
+ src9_h += src15_h * filt7;
+
+ ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
+ SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
+ SAT_SH2_SH(src0_h, src1_h, 7);
+ dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
+ ST_UB(dst0, dst);
+ dst += 16;
+ }
+}
+
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ uint32_t srcd = LW(src_y + 3 * src_stride);
+ SW(srcd, dst + y * dst_stride);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ uint64_t srcd = LD(src_y + 3 * src_stride);
+ SD(srcd, dst + y * dst_stride);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ int y_q4 = y0_q4;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ for (x = 0; x < w; ++x) {
+ dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
+ }
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+  // Note: the fixed-size intermediate buffer, temp, limits the parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --(((64 - 1) * 32 + 15) >> 4) + 1 + 8 = 135 (+1 counts the first row).
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
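+  /* A step of 16 (1/16-pel units) with zero phase is a 1:1 mapping, so the
+   * scaled convolution reduces to a straight copy. */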
+ if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
+ vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ } else {
+ if (w >= 16) {
+ scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ w, intermediate_height);
+ } else if (w == 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, h);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
new file mode 100644
index 0000000000..195228689e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v16u8 out;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
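+    /* Slide the 8-tap context down by four rows for the next iteration. */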
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
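+  /* The frame is processed in 16-pixel-wide column strips; each strip keeps
+   * its own seven rows of vertical context. */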
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 64);
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16u8 filt0;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
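+  /* The two center taps are packed as a byte pair and splatted; an unsigned
+   * dot product of interleaved rows then yields tap0 * row[n] +
+   * tap1 * row[n + 1] for each output pixel. */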
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LD_SB(src);
+ src += src_stride;
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
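+  /* Only the newest row is carried between iterations (src0 = src8), so
+   * each pass loads just eight fresh rows. */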
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src5 = LD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16_t *const filter_y = filter[y0_q4];
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
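+  /* Word 1 of the taps spans filter_y[2..3]; on little-endian builds it
+   * equals 0x800000 only for the identity filter {0, 0, 0, 128, ...}. */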
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (vpx_get_filter_taps(filter_y) == 2) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
new file mode 100644
index 0000000000..ce649935da
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ out2 = __msa_copy_u_w((v4i32)dst2, 0);
+ out3 = __msa_copy_u_w((v4i32)dst3, 0);
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LD_UB2(dst, dst_stride, dst0, dst1);
+
+ AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ SW(out0, dst);
+ dst += dst_stride;
+ SW(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+
+ out0 = __msa_copy_u_d((v2i64)dst0, 0);
+ out1 = __msa_copy_u_d((v2i64)dst1, 0);
+ out2 = __msa_copy_u_d((v2i64)dst2, 0);
+ out3 = __msa_copy_u_d((v2i64)dst3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+ dst += (8 * dst_stride);
+ }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+ LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+ dst_dup += (4 * dst_stride);
+ LD_UB4(src, src_stride, src8, src10, src12, src14);
+ LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+ LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+ dst_dup += (4 * dst_stride);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+ ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+ ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+ LD_UB4(src, 16, src8, src9, src10, src11);
+ src += src_stride;
+ LD_UB4(src, 16, src12, src13, src14, src15);
+ src += src_stride;
+
+ LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+ dst_dup += dst_stride;
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+ dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+ dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+ dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+ dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ switch (w) {
+ case 4: {
+ avg_width4_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
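+      /* Rounded average, (dst + src + 1) >> 1, for non-vector widths. */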
+ int32_t lp, cnt;
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
new file mode 100644
index 0000000000..c2ab33a2f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
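+  /* Copy in 16-byte column strips, eight rows per iteration, using
+   * full-vector loads and stores. */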
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+ dst += (8 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+      /* one 32-bit word stored per row */
+ for (cnt = h; cnt--;) {
+ tmp = LW(src);
+ SW(tmp, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
new file mode 100644
index 0000000000..a0280c5434
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
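+/* 8-tap filter core: each filtN packs two adjacent taps and each vecN the
+ * matching interleaved sample pairs, so the two dot-product/accumulate
+ * chains cover taps 0-3 and 4-7 before a saturating merge. */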
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
+ filt3) \
+ ({ \
+ v8i16 tmp_dpadd_0, tmp_dpadd_1; \
+ \
+ tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
+ tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
+ tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
+ \
+ tmp_dpadd_0; \
+ })
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
+ filt_h1, filt_h2, filt_h3) \
+ ({ \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 hz_out_m; \
+ \
+ VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
+ vec3_m); \
+ hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \
+ filt_h1, filt_h2, filt_h3); \
+ \
+ hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
+ hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
+ \
+ hz_out_m; \
+ })
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+ }
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = PCKEV_XORI128_UB(in1, in0); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
+ { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+ }
+
+#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ }
+
+#endif  // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/postproc.h b/media/libvpx/libvpx/vpx_dsp/postproc.h
new file mode 100644
index 0000000000..37f993f814
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/postproc.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_POSTPROC_H_
+#define VPX_VPX_DSP_POSTPROC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fills a noise buffer with Gaussian noise of strength determined by sigma.
+int vpx_setup_noise(double sigma, int8_t *noise, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VPX_VPX_DSP_POSTPROC_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h
new file mode 100644
index 0000000000..7ac873f9fc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Load 8 16-bit values. If the source is 32-bit then pack down with
+// saturation.
+static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int32x4_t u = vec_vsx_ld(c, s);
+ int32x4_t v = vec_vsx_ld(c, s + 4);
+ return vec_packs(u, v);
+#else
+ return vec_vsx_ld(c, s);
+#endif
+}
+
+// Store 8 16-bit values. If the destination is 32-bit then sign-extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16x8_t one = vec_splat_s16(1);
+ const int32x4_t even = vec_mule(v, one);
+ const int32x4_t odd = vec_mulo(v, one);
+ const int32x4_t high = vec_mergeh(even, odd);
+ const int32x4_t low = vec_mergel(even, odd);
+ vec_vsx_st(high, c, s);
+ vec_vsx_st(low, c, s + 4);
+#else
+ vec_vsx_st(v, c, s);
+#endif
+}
+
+#endif // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c
new file mode 100644
index 0000000000..2129911696
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+extern const int16_t vpx_rv[];
+
+static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A,
+ 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+
+static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+
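+// Down/across kernel: average the four context pixels and, where the
+// largest deviation from the center pixel stays below the filter threshold,
+// blend that average with the center pixel.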
+static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v,
+ uint8x16_t filter) {
+ const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]);
+ const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]);
+ const uint8x16_t k3 = vec_avg(k1, k2);
+ const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1]));
+ const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3]));
+ const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter);
+ return vec_sel(v, vec_avg(k3, v), mask);
+}
+
+static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src,
+ int stride) {
+ ctx[0] = vec_vsx_ld(col - 2 * stride, src);
+ ctx[1] = vec_vsx_ld(col - stride, src);
+ ctx[2] = vec_vsx_ld(col + stride, src);
+ ctx[3] = vec_vsx_ld(col + 2 * stride, src);
+}
+
+static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx,
+ uint8x16_t v, uint8x16_t right_ctx) {
+ static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x1A, 0x1B, 0x1C, 0x1D };
+
+ static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
+ 0x1B, 0x1C, 0x1D, 0x1E };
+
+ static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C,
+ 0x0D, 0x0E, 0x0F, 0x10 };
+
+ static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x10, 0x11 };
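+  // Each permute slides the 16-pixel window by -2, -1, +1 or +2 bytes,
+  // pulling the spill-over pixels from the neighboring context vector.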
+ ctx[0] = vec_perm(left_ctx, v, l2_perm);
+ ctx[1] = vec_perm(left_ctx, v, l1_perm);
+ ctx[2] = vec_perm(v, right_ctx, r1_perm);
+ ctx[3] = vec_perm(v, right_ctx, r2_perm);
+}
+
+void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line, int cols,
+ unsigned char *f, int size) {
+ int row, col;
+ uint8x16_t ctx[4], out, v, left_ctx;
+
+ for (row = 0; row < size; row++) {
+ for (col = 0; col < cols - 8; col += 16) {
+ const uint8x16_t filter = vec_vsx_ld(col, f);
+ v = vec_vsx_ld(col, src_ptr);
+ vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+ vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+ }
+
+ if (col != cols) {
+ const uint8x16_t filter = vec_vsx_ld(col, f);
+ v = vec_vsx_ld(col, src_ptr);
+ vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+ out = apply_filter(ctx, v, filter);
+ vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+ }
+
+ /* now post_proc_across */
+ left_ctx = vec_splats(dst_ptr[0]);
+ v = vec_vsx_ld(0, dst_ptr);
+ for (col = 0; col < cols - 8; col += 16) {
+ const uint8x16_t filter = vec_vsx_ld(col, f);
+ const uint8x16_t right_ctx = (col + 16 == cols)
+ ? vec_splats(dst_ptr[cols - 1])
+ : vec_vsx_ld(col, dst_ptr + 16);
+ horz_ctx(ctx, left_ctx, v, right_ctx);
+ vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+ left_ctx = v;
+ v = right_ctx;
+ }
+
+ if (col != cols) {
+ const uint8x16_t filter = vec_vsx_ld(col, f);
+ const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]);
+ horz_ctx(ctx, left_ctx, v, right_ctx);
+ out = apply_filter(ctx, v, filter);
+ vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+ }
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
+
+// C: s[c + 7]
+static INLINE int16x8_t next7l_s16(uint8x16_t c) {
+ static const uint8x16_t next7_perm = {
+ 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13,
+ 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17,
+ };
+ return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm);
+}
+
+// Slide across window and add.
+static INLINE int16x8_t slide_sum_s16(int16x8_t x) {
+ // x = A B C D E F G H
+ //
+ // 0 A B C D E F G
+ const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3))));
+ // 0 0 A B C D E F
+ const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))),
+ // 0 0 0 A B C D E
+ vec_slo(x, vec_splats((int8_t)(6 << 3))));
+ // 0 0 0 0 A B C D
+ const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))),
+ // 0 0 0 0 0 A B C
+ vec_slo(x, vec_splats((int8_t)(10 << 3))));
+ // 0 0 0 0 0 0 A B
+ const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))),
+ // 0 0 0 0 0 0 0 A
+ vec_slo(x, vec_splats((int8_t)(14 << 3))));
+ return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4));
+}
+
+// Slide across window and add.
+static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) {
+ // 0 A C E
+ // + 0 B D F
+ int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))),
+ vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3))));
+ // 0 0 A C
+ // + 0 0 B D
+ int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))),
+ vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3))));
+ // 0 0 0 A
+ // + 0 0 0 B
+ int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))),
+ vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3))));
+ sumsq_1 = vec_add(sumsq_1, xsq_even);
+ sumsq_2 = vec_add(sumsq_2, sumsq_3);
+ return vec_add(sumsq_1, sumsq_2);
+}
+
+// C: (b + sum + val) >> 4
+static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) {
+ return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4));
+}
+
+// C: sumsq * 15 - sum * sum
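+// The 32-bit even/odd comparison results are interleaved back into 16-bit
+// lanes so the returned mask lines up with the int16x8_t values fed to
+// vec_sel.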
+static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd,
+ int16x8_t sum, int32x4_t lim) {
+ static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05,
+ 0x14, 0x15, 0x08, 0x09, 0x18, 0x19,
+ 0x0C, 0x0D, 0x1C, 0x1D };
+ const int32x4_t sumsq_odd_scaled =
+ vec_mul(sumsq_odd, vec_splats((int32_t)15));
+ const int32x4_t sumsq_even_scaled =
+ vec_mul(sumsq_even, vec_splats((int32_t)15));
+ const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum));
+ const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum));
+
+ const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim);
+ const bool32x4_t mask_even = vec_cmplt(thres_even, lim);
+ return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge);
+}
+
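+// Vector version of the C reference filter: a sliding sum and sum of squares
+// of roughly the 15 pixels around each column is maintained, and the pixel is
+// replaced with (8 + sum + s[c]) >> 4 whenever sumsq * 15 - sum * sum falls
+// below flimit.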
+void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows,
+ int cols, int flimit) {
+ int row, col;
+ const int32x4_t lim = vec_splats(flimit);
+
+ // 8 columns are processed at a time.
+ assert(cols % 8 == 0);
+
+ for (row = 0; row < rows; row++) {
+ // The sum is signed and requires at most 13 bits.
+ // (8 bits + sign) * 15 (4 bits)
+ int16x8_t sum;
+ // The sum of squares requires at most 20 bits.
+ // (16 bits + sign) * 15 (4 bits)
+ int32x4_t sumsq_even, sumsq_odd;
+
+ // Fill left context with first col.
+ int16x8_t left_ctx = vec_splats((int16_t)src[0]);
+ int16_t s = src[0] * 9;
+ int32_t ssq = src[0] * src[0] * 9 + 16;
+
+ // Fill the next 6 columns of the sliding window with cols 2 to 7.
+ for (col = 1; col <= 6; ++col) {
+ s += src[col];
+ ssq += src[col] * src[col];
+ }
+ // Set this sum to every element in the window.
+ sum = vec_splats(s);
+ sumsq_even = vec_splats(ssq);
+ sumsq_odd = vec_splats(ssq);
+
+ for (col = 0; col < cols; col += 8) {
+ bool16x8_t mask;
+ int16x8_t filtered, masked;
+ uint8x16_t out;
+
+ const uint8x16_t val = vec_vsx_ld(0, src + col);
+ const int16x8_t val_high = unpack_to_s16_h(val);
+
+ // C: s[c + 7]
+ const int16x8_t right_ctx = (col + 8 == cols)
+ ? vec_splats((int16_t)src[col + 7])
+ : next7l_s16(val);
+
+ // C: x = s[c + 7] - s[c - 8];
+ const int16x8_t x = vec_sub(right_ctx, left_ctx);
+ const int32x4_t xsq_even =
+ vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx));
+ const int32x4_t xsq_odd =
+ vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx));
+
+ const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd);
+ // A C E G
+ // 0 B D F
+ // 0 A C E
+ // 0 0 B D
+ // 0 0 A C
+ // 0 0 0 B
+ // 0 0 0 A
+ sumsq_even = vec_add(sumsq_even, sumsq_tmp);
+ // B D F G
+ // A C E G
+ // 0 B D F
+ // 0 A C E
+ // 0 0 B D
+ // 0 0 A C
+ // 0 0 0 B
+ // 0 0 0 A
+ sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd));
+
+ sum = vec_add(sum, slide_sum_s16(x));
+
+ // C: (8 + sum + s[c]) >> 4
+ filtered = filter_s16(vec_splats((int16_t)8), sum, val_high);
+ // C: sumsq * 15 - sum * sum
+ mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
+ masked = vec_sel(val_high, filtered, mask);
+
+ out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge);
+ vec_vsx_st(out, 0, src + col);
+
+ // Update window sum and square sum
+ sum = vec_splat(sum, 7);
+ sumsq_even = vec_splat(sumsq_odd, 3);
+ sumsq_odd = vec_splat(sumsq_odd, 3);
+
+ // C: s[c - 8] (for next iteration)
+ left_ctx = val_high;
+ }
+ src += pitch;
+ }
+}
+
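+// Column-wise variant of the same filter: the sliding window of rows is kept
+// in window[], and the vpx_rv dither table supplies the rounding term instead
+// of the fixed 8.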
+void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int col, row, i;
+ int16x8_t window[16];
+ const int32x4_t lim = vec_splats(flimit);
+
+ // 8 columns are processed at a time.
+ assert(cols % 8 == 0);
+ // If rows is less than 8 the bottom border extension fails.
+ assert(rows >= 8);
+
+ for (col = 0; col < cols; col += 8) {
+ // The sum is signed and requires at most 13 bits.
+ // (8 bits + sign) * 15 (4 bits)
+ int16x8_t r1, sum;
+ // The sum of squares requires at most 20 bits.
+ // (16 bits + sign) * 15 (4 bits)
+ int32x4_t sumsq_even, sumsq_odd;
+
+ r1 = unpack_to_s16_h(vec_vsx_ld(0, dst));
+ // Fill sliding window with first row.
+ for (i = 0; i <= 8; i++) {
+ window[i] = r1;
+ }
+ // First 9 rows of the sliding window are the same.
+ // sum = r1 * 9
+ sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16);
+
+ // sumsq = r1 * r1 * 9
+ sumsq_even = vec_mule(sum, r1);
+ sumsq_odd = vec_mulo(sum, r1);
+
+ // Fill the next 6 rows of the sliding window with rows 2 to 7.
+ for (i = 1; i <= 6; ++i) {
+ const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst));
+ window[i + 8] = next_row;
+ sum = vec_add(sum, next_row);
+ sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row));
+ sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row));
+ }
+
+ for (row = 0; row < rows; row++) {
+ int32x4_t d15_even, d15_odd, d0_even, d0_odd;
+ bool16x8_t mask;
+ int16x8_t filtered, masked;
+ uint8x16_t out;
+
+ const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127));
+
+ // Move the sliding window
+ if (row + 7 < rows) {
+ window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst));
+ } else {
+ window[15] = window[14];
+ }
+
+ // C: sum += s[7 * pitch] - s[-8 * pitch];
+ sum = vec_add(sum, vec_sub(window[15], window[0]));
+
+ // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 *
+ // pitch];
+ // Optimization Note: Caching a squared-window for odd and even is
+ // slower than just repeating the multiplies.
+ d15_odd = vec_mulo(window[15], window[15]);
+ d15_even = vec_mule(window[15], window[15]);
+ d0_odd = vec_mulo(window[0], window[0]);
+ d0_even = vec_mule(window[0], window[0]);
+ sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd));
+ sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even));
+
+ // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4
+ filtered = filter_s16(rv, sum, window[8]);
+
+ // C: sumsq * 15 - sum * sum
+ mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
+ masked = vec_sel(window[8], filtered, mask);
+
+ // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per
+ // iteration
+ out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch),
+ load_merge);
+ vec_vsx_st(out, 0, dst + row * pitch);
+
+ // Optimization Note: Turns out that the following loop is faster than
+ // using pointers to manage the sliding window.
+ for (i = 1; i < 16; i++) {
+ window[i - 1] = window[i];
+ }
+ }
+ dst += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
new file mode 100644
index 0000000000..328b0e3130
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/txfm_common_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Returns ((a +/- b) * cospi16 + (1 << 13)) >> 14.
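+// Per lane this is roughly the scalar fdct_round_shift((a +/- b) * cospi_16_64)
+// used by the C transform, with the sum and difference sharing the multiplies.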
+static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
+ int16x8_t *sub) {
+ // Since a + b can overflow 16 bits, the multiplication is distributed
+ // (a * c +/- b * c).
+ const int32x4_t ac_e = vec_mule(a, cospi16_v);
+ const int32x4_t ac_o = vec_mulo(a, cospi16_v);
+ const int32x4_t bc_e = vec_mule(b, cospi16_v);
+ const int32x4_t bc_o = vec_mulo(b, cospi16_v);
+
+ // Reuse the same multiplies for sum and difference.
+ const int32x4_t sum_e = vec_add(ac_e, bc_e);
+ const int32x4_t sum_o = vec_add(ac_o, bc_o);
+ const int32x4_t diff_e = vec_sub(ac_e, bc_e);
+ const int32x4_t diff_o = vec_sub(ac_o, bc_o);
+
+ // Add rounding offset
+ const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+ const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+ const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+ const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+ const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+ const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+ const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+ const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+ // There's no pack operation for even and odd, so we need to permute.
+ *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+ *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// Returns (a * c1 + b * c2 + (1 << 13)) >> 14 in *add and
+// (a * c2 - b * c1 + (1 << 13)) >> 14 in *sub.
+static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
+ int16x8_t c2, int16x8_t *add,
+ int16x8_t *sub) {
+ const int32x4_t ac1_o = vec_mulo(a, c1);
+ const int32x4_t ac1_e = vec_mule(a, c1);
+ const int32x4_t ac2_o = vec_mulo(a, c2);
+ const int32x4_t ac2_e = vec_mule(a, c2);
+
+ const int32x4_t bc1_o = vec_mulo(b, c1);
+ const int32x4_t bc1_e = vec_mule(b, c1);
+ const int32x4_t bc2_o = vec_mulo(b, c2);
+ const int32x4_t bc2_e = vec_mule(b, c2);
+
+ const int32x4_t sum_o = vec_add(ac1_o, bc2_o);
+ const int32x4_t sum_e = vec_add(ac1_e, bc2_e);
+ const int32x4_t diff_o = vec_sub(ac2_o, bc1_o);
+ const int32x4_t diff_e = vec_sub(ac2_e, bc1_e);
+
+ // Add rounding offset
+ const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+ const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+ const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+ const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+ const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+ const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+ const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+ const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+ // There's no pack operation for even and odd, so we need to permute.
+ *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+ *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// While other architectures combine the load and the stage 1 operations,
+// Power9 benchmarking shows no benefit in such an approach.
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+  // Tried different combinations of load and shift instructions; this one is
+  // the fastest.
+ {
+ const int16x8_t l0 = vec_vsx_ld(0, a);
+ const int16x8_t l1 = vec_vsx_ld(0, a + stride);
+ const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride);
+ const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride);
+ const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride);
+ const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride);
+ const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride);
+ const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride);
+
+ const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride);
+ const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride);
+ const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride);
+ const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride);
+ const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride);
+ const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride);
+ const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride);
+ const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride);
+
+ b[0] = vec_sl(l0, vec_dct_scale_log2);
+ b[1] = vec_sl(l1, vec_dct_scale_log2);
+ b[2] = vec_sl(l2, vec_dct_scale_log2);
+ b[3] = vec_sl(l3, vec_dct_scale_log2);
+ b[4] = vec_sl(l4, vec_dct_scale_log2);
+ b[5] = vec_sl(l5, vec_dct_scale_log2);
+ b[6] = vec_sl(l6, vec_dct_scale_log2);
+ b[7] = vec_sl(l7, vec_dct_scale_log2);
+
+ b[8] = vec_sl(l8, vec_dct_scale_log2);
+ b[9] = vec_sl(l9, vec_dct_scale_log2);
+ b[10] = vec_sl(l10, vec_dct_scale_log2);
+ b[11] = vec_sl(l11, vec_dct_scale_log2);
+ b[12] = vec_sl(l12, vec_dct_scale_log2);
+ b[13] = vec_sl(l13, vec_dct_scale_log2);
+ b[14] = vec_sl(l14, vec_dct_scale_log2);
+ b[15] = vec_sl(l15, vec_dct_scale_log2);
+ }
+ {
+ const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride);
+ const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride);
+ const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride);
+ const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride);
+ const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride);
+ const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride);
+ const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride);
+ const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride);
+
+ const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride);
+ const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride);
+ const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride);
+ const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride);
+ const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride);
+ const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride);
+ const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride);
+ const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride);
+
+ b[16] = vec_sl(l16, vec_dct_scale_log2);
+ b[17] = vec_sl(l17, vec_dct_scale_log2);
+ b[18] = vec_sl(l18, vec_dct_scale_log2);
+ b[19] = vec_sl(l19, vec_dct_scale_log2);
+ b[20] = vec_sl(l20, vec_dct_scale_log2);
+ b[21] = vec_sl(l21, vec_dct_scale_log2);
+ b[22] = vec_sl(l22, vec_dct_scale_log2);
+ b[23] = vec_sl(l23, vec_dct_scale_log2);
+
+ b[24] = vec_sl(l24, vec_dct_scale_log2);
+ b[25] = vec_sl(l25, vec_dct_scale_log2);
+ b[26] = vec_sl(l26, vec_dct_scale_log2);
+ b[27] = vec_sl(l27, vec_dct_scale_log2);
+ b[28] = vec_sl(l28, vec_dct_scale_log2);
+ b[29] = vec_sl(l29, vec_dct_scale_log2);
+ b[30] = vec_sl(l30, vec_dct_scale_log2);
+ b[31] = vec_sl(l31, vec_dct_scale_log2);
+ }
+}
+
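+// Each row of 32 output coefficients is assembled from one vector out of each
+// group of eight (b[i], b[i + 8], b[i + 16], b[i + 24]).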
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ vec_vsx_st(b[0], 0, a);
+ vec_vsx_st(b[8], 0, a + 8);
+ vec_vsx_st(b[16], 0, a + 16);
+ vec_vsx_st(b[24], 0, a + 24);
+
+ vec_vsx_st(b[1], 0, a + 32);
+ vec_vsx_st(b[9], 0, a + 40);
+ vec_vsx_st(b[17], 0, a + 48);
+ vec_vsx_st(b[25], 0, a + 56);
+
+ vec_vsx_st(b[2], 0, a + 64);
+ vec_vsx_st(b[10], 0, a + 72);
+ vec_vsx_st(b[18], 0, a + 80);
+ vec_vsx_st(b[26], 0, a + 88);
+
+ vec_vsx_st(b[3], 0, a + 96);
+ vec_vsx_st(b[11], 0, a + 104);
+ vec_vsx_st(b[19], 0, a + 112);
+ vec_vsx_st(b[27], 0, a + 120);
+
+ vec_vsx_st(b[4], 0, a + 128);
+ vec_vsx_st(b[12], 0, a + 136);
+ vec_vsx_st(b[20], 0, a + 144);
+ vec_vsx_st(b[28], 0, a + 152);
+
+ vec_vsx_st(b[5], 0, a + 160);
+ vec_vsx_st(b[13], 0, a + 168);
+ vec_vsx_st(b[21], 0, a + 176);
+ vec_vsx_st(b[29], 0, a + 184);
+
+ vec_vsx_st(b[6], 0, a + 192);
+ vec_vsx_st(b[14], 0, a + 200);
+ vec_vsx_st(b[22], 0, a + 208);
+ vec_vsx_st(b[30], 0, a + 216);
+
+ vec_vsx_st(b[7], 0, a + 224);
+ vec_vsx_st(b[15], 0, a + 232);
+ vec_vsx_st(b[23], 0, a + 240);
+ vec_vsx_st(b[31], 0, a + 248);
+}
+
+// Returns 1 if negative, 0 if positive.
+static INLINE int16x8_t vec_sign_s16(int16x8_t a) {
+ return vec_sr(a, vec_shift_sign_s16);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+ const int16x8_t sign = vec_sign_s16(a);
+ return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t sign = vec_sign_s16(a);
+ return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
+}
+
+static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
+ int16x8_t temp0[32]; // Hold stages: 1, 4, 7
+ int16x8_t temp1[32]; // Hold stages: 2, 5
+ int16x8_t temp2[32]; // Hold stages: 3, 6
+ int i;
+
+ // Stage 1
+  // Unrolling this loop actually slows down Power9 benchmarks
+ for (i = 0; i < 16; i++) {
+ temp0[i] = vec_add(in[i], in[31 - i]);
+ // pass through to stage 3.
+ temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
+ }
+
+ // Stage 2
+  // Unrolling this loop actually slows down Power9 benchmarks
+ for (i = 0; i < 8; i++) {
+ temp1[i] = vec_add(temp0[i], temp0[15 - i]);
+ temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
+ }
+
+ // Apply butterflies (in place) on pass through to stage 3.
+ single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
+ single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
+ single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
+ single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);
+
+  // Scale the magnitude down by 4 so that the intermediate values stay
+  // within the range of 16 bits.
+ if (pass) {
+ temp1[0] = add_round_shift_s16(temp1[0]);
+ temp1[1] = add_round_shift_s16(temp1[1]);
+ temp1[2] = add_round_shift_s16(temp1[2]);
+ temp1[3] = add_round_shift_s16(temp1[3]);
+ temp1[4] = add_round_shift_s16(temp1[4]);
+ temp1[5] = add_round_shift_s16(temp1[5]);
+ temp1[6] = add_round_shift_s16(temp1[6]);
+ temp1[7] = add_round_shift_s16(temp1[7]);
+ temp1[8] = add_round_shift_s16(temp1[8]);
+ temp1[9] = add_round_shift_s16(temp1[9]);
+ temp1[10] = add_round_shift_s16(temp1[10]);
+ temp1[11] = add_round_shift_s16(temp1[11]);
+ temp1[12] = add_round_shift_s16(temp1[12]);
+ temp1[13] = add_round_shift_s16(temp1[13]);
+ temp1[14] = add_round_shift_s16(temp1[14]);
+ temp1[15] = add_round_shift_s16(temp1[15]);
+
+ temp1[16] = add_round_shift_s16(temp1[16]);
+ temp1[17] = add_round_shift_s16(temp1[17]);
+ temp1[18] = add_round_shift_s16(temp1[18]);
+ temp1[19] = add_round_shift_s16(temp1[19]);
+ temp1[20] = add_round_shift_s16(temp1[20]);
+ temp1[21] = add_round_shift_s16(temp1[21]);
+ temp1[22] = add_round_shift_s16(temp1[22]);
+ temp1[23] = add_round_shift_s16(temp1[23]);
+ temp1[24] = add_round_shift_s16(temp1[24]);
+ temp1[25] = add_round_shift_s16(temp1[25]);
+ temp1[26] = add_round_shift_s16(temp1[26]);
+ temp1[27] = add_round_shift_s16(temp1[27]);
+ temp1[28] = add_round_shift_s16(temp1[28]);
+ temp1[29] = add_round_shift_s16(temp1[29]);
+ temp1[30] = add_round_shift_s16(temp1[30]);
+ temp1[31] = add_round_shift_s16(temp1[31]);
+ }
+
+ // Stage 3
+ temp2[0] = vec_add(temp1[0], temp1[7]);
+ temp2[1] = vec_add(temp1[1], temp1[6]);
+ temp2[2] = vec_add(temp1[2], temp1[5]);
+ temp2[3] = vec_add(temp1[3], temp1[4]);
+ temp2[5] = vec_sub(temp1[2], temp1[5]);
+ temp2[6] = vec_sub(temp1[1], temp1[6]);
+ temp2[8] = temp1[8];
+ temp2[9] = temp1[9];
+
+ single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]);
+ single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]);
+ temp2[14] = temp1[14];
+ temp2[15] = temp1[15];
+
+ temp2[18] = vec_add(temp1[18], temp1[21]);
+ temp2[19] = vec_add(temp1[19], temp1[20]);
+
+ temp2[20] = vec_sub(temp1[19], temp1[20]);
+ temp2[21] = vec_sub(temp1[18], temp1[21]);
+
+ temp2[26] = vec_sub(temp1[29], temp1[26]);
+ temp2[27] = vec_sub(temp1[28], temp1[27]);
+
+ temp2[28] = vec_add(temp1[28], temp1[27]);
+ temp2[29] = vec_add(temp1[29], temp1[26]);
+
+ // Pass through Stage 4
+ temp0[7] = vec_sub(temp1[0], temp1[7]);
+ temp0[4] = vec_sub(temp1[3], temp1[4]);
+ temp0[16] = vec_add(temp1[16], temp1[23]);
+ temp0[17] = vec_add(temp1[17], temp1[22]);
+ temp0[22] = vec_sub(temp1[17], temp1[22]);
+ temp0[23] = vec_sub(temp1[16], temp1[23]);
+ temp0[24] = vec_sub(temp1[31], temp1[24]);
+ temp0[25] = vec_sub(temp1[30], temp1[25]);
+ temp0[30] = vec_add(temp1[30], temp1[25]);
+ temp0[31] = vec_add(temp1[31], temp1[24]);
+
+ // Stage 4
+ temp0[0] = vec_add(temp2[0], temp2[3]);
+ temp0[1] = vec_add(temp2[1], temp2[2]);
+ temp0[2] = vec_sub(temp2[1], temp2[2]);
+ temp0[3] = vec_sub(temp2[0], temp2[3]);
+ single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]);
+
+ temp0[9] = vec_add(temp2[9], temp2[10]);
+ temp0[10] = vec_sub(temp2[9], temp2[10]);
+ temp0[13] = vec_sub(temp2[14], temp2[13]);
+ temp0[14] = vec_add(temp2[14], temp2[13]);
+
+ double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29],
+ &temp0[18]);
+ double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28],
+ &temp0[19]);
+ double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27],
+ &temp0[20]);
+ double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26],
+ &temp0[21]);
+
+ // Pass through Stage 5
+ temp1[8] = vec_add(temp2[8], temp2[11]);
+ temp1[11] = vec_sub(temp2[8], temp2[11]);
+ temp1[12] = vec_sub(temp2[15], temp2[12]);
+ temp1[15] = vec_add(temp2[15], temp2[12]);
+
+ // Stage 5
+ // 0 and 1 pass through to 0 and 16 at the end
+ single_butterfly(temp0[0], temp0[1], &out[0], &out[16]);
+
+ // 2 and 3 pass through to 8 and 24 at the end
+ double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]);
+
+ temp1[4] = vec_add(temp0[4], temp0[5]);
+ temp1[5] = vec_sub(temp0[4], temp0[5]);
+ temp1[6] = vec_sub(temp0[7], temp0[6]);
+ temp1[7] = vec_add(temp0[7], temp0[6]);
+
+ double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14],
+ &temp1[9]);
+ double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13],
+ &temp1[10]);
+
+ temp1[17] = vec_add(temp0[17], temp0[18]);
+ temp1[18] = vec_sub(temp0[17], temp0[18]);
+
+ temp1[21] = vec_sub(temp0[22], temp0[21]);
+ temp1[22] = vec_add(temp0[22], temp0[21]);
+
+ temp1[25] = vec_add(temp0[25], temp0[26]);
+ temp1[26] = vec_sub(temp0[25], temp0[26]);
+
+ temp1[29] = vec_sub(temp0[30], temp0[29]);
+ temp1[30] = vec_add(temp0[30], temp0[29]);
+
+ // Pass through Stage 6
+ temp2[16] = vec_add(temp0[16], temp0[19]);
+ temp2[19] = vec_sub(temp0[16], temp0[19]);
+ temp2[20] = vec_sub(temp0[23], temp0[20]);
+ temp2[23] = vec_add(temp0[23], temp0[20]);
+ temp2[24] = vec_add(temp0[24], temp0[27]);
+ temp2[27] = vec_sub(temp0[24], temp0[27]);
+ temp2[28] = vec_sub(temp0[31], temp0[28]);
+ temp2[31] = vec_add(temp0[31], temp0[28]);
+
+ // Stage 6
+ // 4 and 7 pass through to 4 and 28 at the end
+ double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]);
+ // 5 and 6 pass through to 20 and 12 at the end
+ double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20],
+ &out[12]);
+ temp2[8] = vec_add(temp1[8], temp1[9]);
+ temp2[9] = vec_sub(temp1[8], temp1[9]);
+ temp2[10] = vec_sub(temp1[11], temp1[10]);
+ temp2[11] = vec_add(temp1[11], temp1[10]);
+ temp2[12] = vec_add(temp1[12], temp1[13]);
+ temp2[13] = vec_sub(temp1[12], temp1[13]);
+ temp2[14] = vec_sub(temp1[15], temp1[14]);
+ temp2[15] = vec_add(temp1[15], temp1[14]);
+
+ double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30],
+ &temp2[17]);
+ double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29],
+ &temp2[18]);
+ double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26],
+ &temp2[21]);
+ double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25],
+ &temp2[22]);
+
+ // Stage 7
+ double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]);
+ double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18],
+ &out[14]);
+ double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10],
+ &out[22]);
+ double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26],
+ &out[6]);
+
+ temp0[16] = vec_add(temp2[16], temp2[17]);
+ temp0[17] = vec_sub(temp2[16], temp2[17]);
+ temp0[18] = vec_sub(temp2[19], temp2[18]);
+ temp0[19] = vec_add(temp2[19], temp2[18]);
+ temp0[20] = vec_add(temp2[20], temp2[21]);
+ temp0[21] = vec_sub(temp2[20], temp2[21]);
+ temp0[22] = vec_sub(temp2[23], temp2[22]);
+ temp0[23] = vec_add(temp2[23], temp2[22]);
+ temp0[24] = vec_add(temp2[24], temp2[25]);
+ temp0[25] = vec_sub(temp2[24], temp2[25]);
+ temp0[26] = vec_sub(temp2[27], temp2[26]);
+ temp0[27] = vec_add(temp2[27], temp2[26]);
+ temp0[28] = vec_add(temp2[28], temp2[29]);
+ temp0[29] = vec_sub(temp2[28], temp2[29]);
+ temp0[30] = vec_sub(temp2[31], temp2[30]);
+ temp0[31] = vec_add(temp2[31], temp2[30]);
+
+  // Final stage: output indices are bit-reversed.
+ double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1],
+ &out[31]);
+ double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17],
+ &out[15]);
+ double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9],
+ &out[23]);
+ double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25],
+ &out[7]);
+ double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5],
+ &out[27]);
+ double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21],
+ &out[11]);
+ double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13],
+ &out[19]);
+ double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29],
+ &out[3]);
+
+ if (pass == 0) {
+ for (i = 0; i < 32; i++) {
+ out[i] = sub_round_shift(out[i]);
+ }
+ }
+}
+
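+// Two-pass 32x32 forward transform: four 8x32 column transforms (pass 0),
+// then a transpose and four 8x32 row transforms (pass 1), with each strip
+// transposed again before being stored.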
+void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+ int16x8_t temp6[32];
+
+ // Process in 8x32 columns.
+ load(input, stride, temp0);
+ fdct32_vsx(temp0, temp1, 0);
+
+ load(input + 8, stride, temp0);
+ fdct32_vsx(temp0, temp2, 0);
+
+ load(input + 16, stride, temp0);
+ fdct32_vsx(temp0, temp3, 0);
+
+ load(input + 24, stride, temp0);
+ fdct32_vsx(temp0, temp4, 0);
+
+  // Generate the first 8x32 row block by combining the first 8 vectors from
+  // each of the four column transforms.
+ transpose_8x8(&temp1[0], &temp0[0]);
+ transpose_8x8(&temp2[0], &temp0[8]);
+ transpose_8x8(&temp3[0], &temp0[16]);
+ transpose_8x8(&temp4[0], &temp0[24]);
+
+ fdct32_vsx(temp0, temp5, 1);
+
+ transpose_8x8(&temp5[0], &temp6[0]);
+ transpose_8x8(&temp5[8], &temp6[8]);
+ transpose_8x8(&temp5[16], &temp6[16]);
+ transpose_8x8(&temp5[24], &temp6[24]);
+
+ store(out, temp6);
+
+ // Second row of 8x32.
+ transpose_8x8(&temp1[8], &temp0[0]);
+ transpose_8x8(&temp2[8], &temp0[8]);
+ transpose_8x8(&temp3[8], &temp0[16]);
+ transpose_8x8(&temp4[8], &temp0[24]);
+
+ fdct32_vsx(temp0, temp5, 1);
+
+ transpose_8x8(&temp5[0], &temp6[0]);
+ transpose_8x8(&temp5[8], &temp6[8]);
+ transpose_8x8(&temp5[16], &temp6[16]);
+ transpose_8x8(&temp5[24], &temp6[24]);
+
+ store(out + 8 * 32, temp6);
+
+ // Third row of 8x32
+ transpose_8x8(&temp1[16], &temp0[0]);
+ transpose_8x8(&temp2[16], &temp0[8]);
+ transpose_8x8(&temp3[16], &temp0[16]);
+ transpose_8x8(&temp4[16], &temp0[24]);
+
+ fdct32_vsx(temp0, temp5, 1);
+
+ transpose_8x8(&temp5[0], &temp6[0]);
+ transpose_8x8(&temp5[8], &temp6[8]);
+ transpose_8x8(&temp5[16], &temp6[16]);
+ transpose_8x8(&temp5[24], &temp6[24]);
+
+ store(out + 16 * 32, temp6);
+
+ // Final row of 8x32.
+ transpose_8x8(&temp1[24], &temp0[0]);
+ transpose_8x8(&temp2[24], &temp0[8]);
+ transpose_8x8(&temp3[24], &temp0[16]);
+ transpose_8x8(&temp4[24], &temp0[24]);
+
+ fdct32_vsx(temp0, temp5, 1);
+
+ transpose_8x8(&temp5[0], &temp6[0]);
+ transpose_8x8(&temp5[8], &temp6[8]);
+ transpose_8x8(&temp5[16], &temp6[16]);
+ transpose_8x8(&temp5[24], &temp6[24]);
+
+ store(out + 24 * 32, temp6);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c
new file mode 100644
index 0000000000..e279b30478
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
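+// One pass of the 8-point Hadamard butterfly applied to all eight lanes at
+// once; the caller runs it twice with a transpose in between to cover rows
+// and columns.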
+static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
+ const int16x8_t b0 = vec_add(v[0], v[1]);
+ const int16x8_t b1 = vec_sub(v[0], v[1]);
+ const int16x8_t b2 = vec_add(v[2], v[3]);
+ const int16x8_t b3 = vec_sub(v[2], v[3]);
+ const int16x8_t b4 = vec_add(v[4], v[5]);
+ const int16x8_t b5 = vec_sub(v[4], v[5]);
+ const int16x8_t b6 = vec_add(v[6], v[7]);
+ const int16x8_t b7 = vec_sub(v[6], v[7]);
+
+ const int16x8_t c0 = vec_add(b0, b2);
+ const int16x8_t c1 = vec_add(b1, b3);
+ const int16x8_t c2 = vec_sub(b0, b2);
+ const int16x8_t c3 = vec_sub(b1, b3);
+ const int16x8_t c4 = vec_add(b4, b6);
+ const int16x8_t c5 = vec_add(b5, b7);
+ const int16x8_t c6 = vec_sub(b4, b6);
+ const int16x8_t c7 = vec_sub(b5, b7);
+
+ v[0] = vec_add(c0, c4);
+ v[1] = vec_sub(c2, c6);
+ v[2] = vec_sub(c0, c4);
+ v[3] = vec_add(c2, c6);
+ v[4] = vec_add(c3, c7);
+ v[5] = vec_sub(c3, c7);
+ v[6] = vec_sub(c1, c5);
+ v[7] = vec_add(c1, c5);
+}
+
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x8_t v[8];
+
+ v[0] = vec_vsx_ld(0, src_diff);
+ v[1] = vec_vsx_ld(0, src_diff + src_stride);
+ v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride));
+ v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride));
+ v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride));
+ v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride));
+ v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride));
+ v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride));
+
+ vpx_hadamard_s16_8x8_one_pass(v);
+
+ vpx_transpose_s16_8x8(v);
+
+ vpx_hadamard_s16_8x8_one_pass(v);
+
+ store_tran_low(v[0], 0, coeff);
+ store_tran_low(v[1], 0, coeff + 8);
+ store_tran_low(v[2], 0, coeff + 16);
+ store_tran_low(v[3], 0, coeff + 24);
+ store_tran_low(v[4], 0, coeff + 32);
+ store_tran_low(v[5], 0, coeff + 40);
+ store_tran_low(v[6], 0, coeff + 48);
+ store_tran_low(v[7], 0, coeff + 56);
+}
+
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+ const uint16x8_t ones = vec_splat_u16(1);
+
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff);
+ /* Top right. */
+ vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+ /* Overlay the 8x8 blocks and combine. */
+ for (i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = load_tran_low(0, coeff);
+ const int16x8_t a1 = load_tran_low(0, coeff + 64);
+ const int16x8_t a2 = load_tran_low(0, coeff + 128);
+ const int16x8_t a3 = load_tran_low(0, coeff + 192);
+
+ /* Prevent the result from escaping int16_t. */
+ const int16x8_t b0 = vec_sra(a0, ones);
+ const int16x8_t b1 = vec_sra(a1, ones);
+ const int16x8_t b2 = vec_sra(a2, ones);
+ const int16x8_t b3 = vec_sra(a3, ones);
+
+ const int16x8_t c0 = vec_add(b0, b1);
+ const int16x8_t c2 = vec_add(b2, b3);
+ const int16x8_t c1 = vec_sub(b0, b1);
+ const int16x8_t c3 = vec_sub(b2, b3);
+
+ const int16x8_t d0 = vec_add(c0, c2);
+ const int16x8_t d1 = vec_add(c1, c3);
+ const int16x8_t d2 = vec_sub(c0, c2);
+ const int16x8_t d3 = vec_sub(c1, c3);
+
+ store_tran_low(d0, 0, coeff);
+ store_tran_low(d1, 0, coeff + 64);
+ store_tran_low(d2, 0, coeff + 128);
+ store_tran_low(d3, 0, coeff + 192);
+
+ coeff += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c
new file mode 100644
index 0000000000..a4c8322ff2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vec_vsx_st(d, 0, dst);
+ }
+}
+
+void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vec_vsx_ld(0, above);
+ const uint8x16_t d1 = vec_vsx_ld(16, above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 32; i++, dst += stride) {
+ vec_vsx_st(d0, 0, dst);
+ vec_vsx_st(d1, 16, dst);
+ }
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+
+void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ (void)above;
+
+ vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+}
+
+void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ const uint8x16_t v4 = vec_splat(d, 4);
+ const uint8x16_t v5 = vec_splat(d, 5);
+ const uint8x16_t v6 = vec_splat(d, 6);
+ const uint8x16_t v7 = vec_splat(d, 7);
+
+ (void)above;
+
+ vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
+}
+#endif
+
+void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ const uint8x16_t v4 = vec_splat(d, 4);
+ const uint8x16_t v5 = vec_splat(d, 5);
+ const uint8x16_t v6 = vec_splat(d, 6);
+ const uint8x16_t v7 = vec_splat(d, 7);
+
+ const uint8x16_t v8 = vec_splat(d, 8);
+ const uint8x16_t v9 = vec_splat(d, 9);
+ const uint8x16_t v10 = vec_splat(d, 10);
+ const uint8x16_t v11 = vec_splat(d, 11);
+
+ const uint8x16_t v12 = vec_splat(d, 12);
+ const uint8x16_t v13 = vec_splat(d, 13);
+ const uint8x16_t v14 = vec_splat(d, 14);
+ const uint8x16_t v15 = vec_splat(d, 15);
+
+ (void)above;
+
+ vec_vsx_st(v0, 0, dst);
+ dst += stride;
+ vec_vsx_st(v1, 0, dst);
+ dst += stride;
+ vec_vsx_st(v2, 0, dst);
+ dst += stride;
+ vec_vsx_st(v3, 0, dst);
+ dst += stride;
+ vec_vsx_st(v4, 0, dst);
+ dst += stride;
+ vec_vsx_st(v5, 0, dst);
+ dst += stride;
+ vec_vsx_st(v6, 0, dst);
+ dst += stride;
+ vec_vsx_st(v7, 0, dst);
+ dst += stride;
+ vec_vsx_st(v8, 0, dst);
+ dst += stride;
+ vec_vsx_st(v9, 0, dst);
+ dst += stride;
+ vec_vsx_st(v10, 0, dst);
+ dst += stride;
+ vec_vsx_st(v11, 0, dst);
+ dst += stride;
+ vec_vsx_st(v12, 0, dst);
+ dst += stride;
+ vec_vsx_st(v13, 0, dst);
+ dst += stride;
+ vec_vsx_st(v14, 0, dst);
+ dst += stride;
+ vec_vsx_st(v15, 0, dst);
+}
+
+#define H_PREDICTOR_32(v) \
+ vec_vsx_st(v, 0, dst); \
+ vec_vsx_st(v, 16, dst); \
+ dst += stride
+
+void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vec_vsx_ld(0, left);
+ const uint8x16_t d1 = vec_vsx_ld(16, left);
+
+ const uint8x16_t v0_0 = vec_splat(d0, 0);
+ const uint8x16_t v1_0 = vec_splat(d0, 1);
+ const uint8x16_t v2_0 = vec_splat(d0, 2);
+ const uint8x16_t v3_0 = vec_splat(d0, 3);
+ const uint8x16_t v4_0 = vec_splat(d0, 4);
+ const uint8x16_t v5_0 = vec_splat(d0, 5);
+ const uint8x16_t v6_0 = vec_splat(d0, 6);
+ const uint8x16_t v7_0 = vec_splat(d0, 7);
+ const uint8x16_t v8_0 = vec_splat(d0, 8);
+ const uint8x16_t v9_0 = vec_splat(d0, 9);
+ const uint8x16_t v10_0 = vec_splat(d0, 10);
+ const uint8x16_t v11_0 = vec_splat(d0, 11);
+ const uint8x16_t v12_0 = vec_splat(d0, 12);
+ const uint8x16_t v13_0 = vec_splat(d0, 13);
+ const uint8x16_t v14_0 = vec_splat(d0, 14);
+ const uint8x16_t v15_0 = vec_splat(d0, 15);
+
+ const uint8x16_t v0_1 = vec_splat(d1, 0);
+ const uint8x16_t v1_1 = vec_splat(d1, 1);
+ const uint8x16_t v2_1 = vec_splat(d1, 2);
+ const uint8x16_t v3_1 = vec_splat(d1, 3);
+ const uint8x16_t v4_1 = vec_splat(d1, 4);
+ const uint8x16_t v5_1 = vec_splat(d1, 5);
+ const uint8x16_t v6_1 = vec_splat(d1, 6);
+ const uint8x16_t v7_1 = vec_splat(d1, 7);
+ const uint8x16_t v8_1 = vec_splat(d1, 8);
+ const uint8x16_t v9_1 = vec_splat(d1, 9);
+ const uint8x16_t v10_1 = vec_splat(d1, 10);
+ const uint8x16_t v11_1 = vec_splat(d1, 11);
+ const uint8x16_t v12_1 = vec_splat(d1, 12);
+ const uint8x16_t v13_1 = vec_splat(d1, 13);
+ const uint8x16_t v14_1 = vec_splat(d1, 14);
+ const uint8x16_t v15_1 = vec_splat(d1, 15);
+
+ (void)above;
+
+ H_PREDICTOR_32(v0_0);
+ H_PREDICTOR_32(v1_0);
+ H_PREDICTOR_32(v2_0);
+ H_PREDICTOR_32(v3_0);
+
+ H_PREDICTOR_32(v4_0);
+ H_PREDICTOR_32(v5_0);
+ H_PREDICTOR_32(v6_0);
+ H_PREDICTOR_32(v7_0);
+
+ H_PREDICTOR_32(v8_0);
+ H_PREDICTOR_32(v9_0);
+ H_PREDICTOR_32(v10_0);
+ H_PREDICTOR_32(v11_0);
+
+ H_PREDICTOR_32(v12_0);
+ H_PREDICTOR_32(v13_0);
+ H_PREDICTOR_32(v14_0);
+ H_PREDICTOR_32(v15_0);
+
+ H_PREDICTOR_32(v0_1);
+ H_PREDICTOR_32(v1_1);
+ H_PREDICTOR_32(v2_1);
+ H_PREDICTOR_32(v3_1);
+
+ H_PREDICTOR_32(v4_1);
+ H_PREDICTOR_32(v5_1);
+ H_PREDICTOR_32(v6_1);
+ H_PREDICTOR_32(v7_1);
+
+ H_PREDICTOR_32(v8_1);
+ H_PREDICTOR_32(v9_1);
+ H_PREDICTOR_32(v10_1);
+ H_PREDICTOR_32(v11_1);
+
+ H_PREDICTOR_32(v12_1);
+ H_PREDICTOR_32(v13_1);
+ H_PREDICTOR_32(v14_1);
+ H_PREDICTOR_32(v15_1);
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+ const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+ int16x8_t tmp, val;
+ uint8x16_t d;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+}
+
+void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+ const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+ int16x8_t tmp, val;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+}
+#endif
+
+static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
+ int16x8_t ah, int16x8_t al, int16x8_t tl) {
+ int16x8_t vh, vl, ls;
+
+ ls = vec_splat(l, 0);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 1);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 2);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 3);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 4);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 5);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 6);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 7);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+}
+
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const uint8x16_t l = vec_vsx_ld(0, left);
+ const int16x8_t lh = unpack_to_s16_h(l);
+ const int16x8_t ll = unpack_to_s16_l(l);
+ const uint8x16_t a = vec_vsx_ld(0, above);
+ const int16x8_t ah = unpack_to_s16_h(a);
+ const int16x8_t al = unpack_to_s16_l(a);
+
+ tm_predictor_16x8(dst, stride, lh, ah, al, tl);
+
+ dst += stride * 8;
+
+ tm_predictor_16x8(dst, stride, ll, ah, al, tl);
+}
+
+static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
+ const int16x8_t a0h, const int16x8_t a0l,
+ const int16x8_t a1h, const int16x8_t a1l,
+ const int16x8_t tl) {
+ int16x8_t vh, vl;
+
+ vh = vec_sub(vec_add(ls, a0h), tl);
+ vl = vec_sub(vec_add(ls, a0l), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ vh = vec_sub(vec_add(ls, a1h), tl);
+ vl = vec_sub(vec_add(ls, a1l), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 16, dst);
+}
+
+static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
+ const int16x8_t l, const uint8x16_t a0,
+ const uint8x16_t a1, const int16x8_t tl) {
+ const int16x8_t a0h = unpack_to_s16_h(a0);
+ const int16x8_t a0l = unpack_to_s16_l(a0);
+ const int16x8_t a1h = unpack_to_s16_h(a1);
+ const int16x8_t a1l = unpack_to_s16_l(a1);
+
+ tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
+}
+
+void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const uint8x16_t l1 = vec_vsx_ld(16, left);
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
+}
+
+static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ const uint8x16_t d = vec_vsx_ld(0, dst);
+ vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
+ }
+}
+
+static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vec_vsx_st(val, 0, dst);
+ }
+}
+
+void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+ (void)above;
+ (void)left;
+
+ dc_fill_predictor_16x16(dst, stride, v128);
+}
+
+static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 32; i++, dst += stride) {
+ vec_vsx_st(val, 0, dst);
+ vec_vsx_st(val, 16, dst);
+ }
+}
+
+void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+ (void)above;
+ (void)left;
+
+ dc_fill_predictor_32x32(dst, stride, v128);
+}
+
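+// Average of 16 pixels: (sum + 8) >> 4, broadcast to every byte lane.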
+static uint8x16_t avg16(const uint8_t *values) {
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ dc_fill_predictor_16x16(dst, stride, avg16(left));
+}
+
+void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ dc_fill_predictor_16x16(dst, stride, avg16(above));
+}
+
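+// Average of 32 pixels: (sum + 16) >> 5, broadcast to every byte lane.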
+static uint8x16_t avg32(const uint8_t *values) {
+ const uint8x16_t v0 = vec_vsx_ld(0, values);
+ const uint8x16_t v1 = vec_vsx_ld(16, values);
+ const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ dc_fill_predictor_32x32(dst, stride, avg32(left));
+}
+
+void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ dc_fill_predictor_32x32(dst, stride, avg32(above));
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+ const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+#endif
+
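+// DC average over 16 above and 16 left pixels: (sum + 16) >> 5.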
+static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
+}
+#endif
+
+void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
+}
+
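+// DC average over 32 above and 32 left pixels: (sum + 32) >> 6.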
+static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const uint8x16_t l1 = vec_vsx_ld(16, left);
+ const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
+ const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
+ const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
+}
+
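+// Three-tap smoothing filter used by the D45/D63 predictors: roughly
+// (a + 2 * b + c + 2) >> 2, computed with 8-bit averaging ops to avoid
+// widening.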
+static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
+ const uint8x16_t c) {
+ const uint8x16_t ac =
+ vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));
+
+ return vec_avg(ac, b);
+}
+
+// Work around vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
+static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t af = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(af, 7);
+ const uint8x16_t a = xxpermdi(af, above_right, 1);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 8; i++) {
+ const uint8x16_t d = vec_vsx_ld(0, dst);
+ vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
+ dst += stride;
+ row = vec_perm(row, above_right, sl1);
+ }
+}
+#endif
+
+void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(a, 15);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++) {
+ vec_vsx_st(row, 0, dst);
+ dst += stride;
+ row = vec_perm(row, above_right, sl1);
+ }
+}
+
+void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t above_right = vec_splat(a1, 15);
+ const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+ const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+ const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+ const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+ uint8x16_t row0 = avg3(a0, b0, c0);
+ uint8x16_t row1 = avg3(a1, b1, c1);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 32; i++) {
+ vec_vsx_st(row0, 0, dst);
+ vec_vsx_st(row1, 16, dst);
+ dst += stride;
+ row0 = vec_perm(row0, row1, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t af = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(af, 9);
+ const uint8x16_t a = xxpermdi(af, above_right, 1);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row0 = vec_avg(a, b);
+ uint8x16_t row1 = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 4; i++) {
+ const uint8x16_t d0 = vec_vsx_ld(0, dst);
+ const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
+ vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
+ vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
+ dst += stride * 2;
+ row0 = vec_perm(row0, above_right, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+#endif
+
+void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t above_right = vec_splat(a1, 0);
+ const uint8x16_t b = vec_perm(a0, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row0 = vec_avg(a0, b);
+ uint8x16_t row1 = avg3(a0, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 8; i++) {
+ vec_vsx_st(row0, 0, dst);
+ vec_vsx_st(row1, 0, dst + stride);
+ dst += stride * 2;
+ row0 = vec_perm(row0, above_right, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+
+void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t a2 = vec_vsx_ld(32, above);
+ const uint8x16_t above_right = vec_splat(a2, 0);
+ const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+ const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+ const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+ const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+ uint8x16_t row0_0 = vec_avg(a0, b0);
+ uint8x16_t row0_1 = vec_avg(a1, b1);
+ uint8x16_t row1_0 = avg3(a0, b0, c0);
+ uint8x16_t row1_1 = avg3(a1, b1, c1);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++) {
+ vec_vsx_st(row0_0, 0, dst);
+ vec_vsx_st(row0_1, 16, dst);
+ vec_vsx_st(row1_0, 0, dst + stride);
+ vec_vsx_st(row1_1, 16, dst + stride);
+ dst += stride * 2;
+ row0_0 = vec_perm(row0_0, row0_1, sl1);
+ row0_1 = vec_perm(row0_1, above_right, sl1);
+ row1_0 = vec_perm(row1_0, row1_1, sl1);
+ row1_1 = vec_perm(row1_1, above_right, sl1);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
new file mode 100644
index 0000000000..e99412ecab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
@@ -0,0 +1,1828 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/inv_txfm_vsx.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
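+// Vector splats of the 2^14-scaled transform constants: cospi<k>_v holds
+// cospi_k_64 = round(16384 * cos(k * pi / 64)) from vpx_dsp/txfm_common.h in
+// all eight lanes, and the "m" variants hold the negated value. The
+// sinpi_*_9_v constants are the vp9 4-point ADST multipliers, i.e.
+// sin(k * pi / 9) scaled by roughly 16384 * 2 * sqrt(2) / 3.
+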
+static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+ 16364, 16364, 16364, 16364 };
+static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364,
+ -16364, -16364, -16364, -16364 };
+static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+ 16305, 16305, 16305, 16305 };
+static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305,
+ -16305, -16305, -16305, -16305 };
+static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+ 16207, 16207, 16207, 16207 };
+static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+ 16069, 16069, 16069, 16069 };
+static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+ -16069, -16069, -16069, -16069 };
+static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+ 15893, 15893, 15893, 15893 };
+static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893,
+ -15893, -15893, -15893, -15893 };
+static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+ 15679, 15679, 15679, 15679 };
+static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+ 15426, 15426, 15426, 15426 };
+static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+ 15137, 15137, 15137, 15137 };
+static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+ -15137, -15137, -15137, -15137 };
+static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+ 14811, 14811, 14811, 14811 };
+static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811,
+ -14811, -14811, -14811, -14811 };
+static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+ 14449, 14449, 14449, 14449 };
+static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449,
+ -14449, -14449, -14449, -14449 };
+static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+ 14053, 14053, 14053, 14053 };
+static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+ 13623, 13623, 13623, 13623 };
+static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623,
+ -13623, -13623, -13623, -13623 };
+static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+ 13160, 13160, 13160, 13160 };
+static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160,
+ -13160, -13160, -13160, -13160 };
+static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+ 12665, 12665, 12665, 12665 };
+static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+ 12140, 12140, 12140, 12140 };
+static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+ 11585, 11585, 11585, 11585 };
+static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585,
+ -11585, -11585, -11585, -11585 };
+static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+ 11003, 11003, 11003, 11003 };
+static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003,
+ -11003, -11003, -11003, -11003 };
+static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+ 10394, 10394, 10394, 10394 };
+static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394,
+ -10394, -10394, -10394, -10394 };
+static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
+ 9760, 9760, 9760, 9760 };
+static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
+ 9102, 9102, 9102, 9102 };
+static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+ -9102, -9102, -9102, -9102 };
+static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
+ 8423, 8423, 8423, 8423 };
+static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423,
+ -8423, -8423, -8423, -8423 };
+static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
+ 7723, 7723, 7723, 7723 };
+static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
+ 7005, 7005, 7005, 7005 };
+static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
+ 6270, 6270, 6270, 6270 };
+static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270,
+ -6270, -6270, -6270, -6270 };
+static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
+ 5520, 5520, 5520, 5520 };
+static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520,
+ -5520, -5520, -5520, -5520 };
+static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
+ 4756, 4756, 4756, 4756 };
+static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756,
+ -4756, -4756, -4756, -4756 };
+static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
+ 3981, 3981, 3981, 3981 };
+static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
+ 3196, 3196, 3196, 3196 };
+static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196,
+ -3196, -3196, -3196, -3196 };
+static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
+ 2404, 2404, 2404, 2404 };
+static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404,
+ -2404, -2404, -2404, -2404 };
+static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
+ 1606, 1606, 1606, 1606 };
+static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+
+static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283,
+ 5283, 5283, 5283, 5283 };
+static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929,
+ 9929, 9929, 9929, 9929 };
+static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377,
+ 13377, 13377, 13377, 13377 };
+static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212,
+ 15212, 15212, 15212, 15212 };
+
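+// Byte-permute masks selecting the low (tr8_mask0) or high (tr8_mask1)
+// 8 bytes of two input vectors; used as the 64-bit stage of TRANSPOSE8x8.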
+static uint8x16_t tr8_mask0 = {
+ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+static uint8x16_t tr8_mask1 = {
+ 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+};
+
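+// ROUND_SHIFT_INIT materializes 2^13 and the shift amount 14;
+// DCT_CONST_ROUND_SHIFT is then the vector form of dct_const_round_shift():
+// (x + 2^13) >> 14.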
+#define ROUND_SHIFT_INIT \
+ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
+ const uint32x4_t shift14 = vec_splat_u32(14);
+
+#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);
+
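+// Final 4x4 reconstruction rounding: PIXEL_ADD4 computes (in + 8) >> 4 with
+// the constants set up by PIXEL_ADD_INIT.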
+#define PIXEL_ADD_INIT \
+ int16x8_t add8 = vec_splat_s16(8); \
+ uint16x8_t shift4 = vec_splat_u16(4);
+
+#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
+
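+// 4-point 1-D inverse DCT; the scalar reference (idct4_c in
+// vpx_dsp/inv_txfm.c) computes, per column, with round = dct_const_round_shift:
+//   step0 = round((x0 + x2) * cospi_16_64)  step1 = round((x0 - x2) * cospi_16_64)
+//   step2 = round(x1 * cospi_24_64 - x3 * cospi_8_64)
+//   step3 = round(x1 * cospi_8_64 + x3 * cospi_24_64)
+//   out = { step0 + step3, step1 + step2, step1 - step2, step0 - step3 }
+// Here each input vector carries two rows; the trailing vec_perm(mask0)
+// swaps the 8-byte halves of out1 to restore row order.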
+#define IDCT4(in0, in1, out0, out1) \
+ t0 = vec_add(in0, in1); \
+ t1 = vec_sub(in0, in1); \
+ tmp16_0 = vec_mergeh(t0, t1); \
+ temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \
+ temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \
+ \
+ tmp16_0 = vec_mergel(in0, in1); \
+ temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp3); \
+ temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
+ DCT_CONST_ROUND_SHIFT(temp4); \
+ \
+ step0 = vec_packs(temp1, temp2); \
+ step1 = vec_packs(temp4, temp3); \
+ out0 = vec_add(step0, step1); \
+ out1 = vec_sub(step0, step1); \
+ out1 = vec_perm(out1, out1, mask0);
+
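+// Add the transform output to the four 4-pixel destination rows, pack to
+// unsigned bytes, and store through a 16-byte scratch buffer: the block is
+// only 4 bytes wide, so a scalar tail copy avoids writing past each row.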
+#define PACK_STORE(v0, v1) \
+ tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \
+ tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \
+ output_v = vec_packsu(tmp16_0, tmp16_1); \
+ \
+ vec_vsx_st(output_v, 0, tmp_dest); \
+ for (i = 0; i < 4; i++) \
+ for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
+
+void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
+ int stride) {
+ int i, j;
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ uint8x16_t zerov = vec_splat_u8(0);
+ int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+ int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+ int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+ int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+ int16x8_t tmp16_0, tmp16_1;
+ uint8x16_t output_v;
+ uint8_t tmp_dest[16];
+ PIXEL_ADD_INIT;
+
+ PIXEL_ADD4(out[0], in[0]);
+ PIXEL_ADD4(out[1], in[1]);
+
+ PACK_STORE(out[0], out[1]);
+}
+
+void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) {
+ int32x4_t temp1, temp2, temp3, temp4;
+ int16x8_t step0, step1, tmp16_0;
+ uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
+ int16x8_t t0 = vec_mergeh(in[0], in[1]);
+ int16x8_t t1 = vec_mergel(in[0], in[1]);
+ ROUND_SHIFT_INIT
+
+ in[0] = vec_mergeh(t0, t1);
+ in[1] = vec_mergel(t0, t1);
+
+ IDCT4(in[0], in[1], out[0], out[1]);
+}
+
+void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t in[2], out[2];
+
+ in[0] = load_tran_low(0, input);
+ in[1] = load_tran_low(8 * sizeof(*input), input);
+ // Rows
+ vpx_idct4_vsx(in, out);
+
+ // Columns
+ vpx_idct4_vsx(out, in);
+
+ vpx_round_store4x4_vsx(in, out, dest, stride);
+}
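+
+// Each vpx_idct4_vsx() call transposes its input (the mergeh/mergel pairs)
+// before the 1-D transform, so two calls give the row and column passes.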
+
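+// 8x8 16-bit transpose in three merge stages: 16-bit mergeh/mergel pairs,
+// then 32-bit merges, then 64-bit half selects via tr8_mask0/tr8_mask1.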
+#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ out0 = vec_mergeh(in0, in1); \
+ out1 = vec_mergel(in0, in1); \
+ out2 = vec_mergeh(in2, in3); \
+ out3 = vec_mergel(in2, in3); \
+ out4 = vec_mergeh(in4, in5); \
+ out5 = vec_mergel(in4, in5); \
+ out6 = vec_mergeh(in6, in7); \
+ out7 = vec_mergel(in6, in7); \
+ in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \
+ in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \
+ in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \
+ in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \
+ in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \
+ in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \
+ in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \
+ in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \
+ out0 = vec_perm(in0, in4, tr8_mask0); \
+ out1 = vec_perm(in0, in4, tr8_mask1); \
+ out2 = vec_perm(in1, in5, tr8_mask0); \
+ out3 = vec_perm(in1, in5, tr8_mask1); \
+ out4 = vec_perm(in2, in6, tr8_mask0); \
+ out5 = vec_perm(in2, in6, tr8_mask1); \
+ out6 = vec_perm(in3, in7, tr8_mask0); \
+ out7 = vec_perm(in3, in7, tr8_mask1);
+
+/* Butterfly with two constants:
+ *   temp1 = step[x] * cospi_q - step[y] * cospi_z
+ *   temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
+ temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
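+/* Shared-constant butterfly:
+ *   outpt0 = round((inpt0 - inpt1) * cospi)
+ *   outpt1 = round((inpt0 + inpt1) * cospi) */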
+#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+ tmp16_2 = vec_sub(inpt0, inpt1); \
+ tmp16_3 = vec_add(inpt0, inpt1); \
+ tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \
+ tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \
+ temp10 = vec_mule(tmp16_0, cospi); \
+ temp11 = vec_mule(tmp16_1, cospi); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_mulo(tmp16_0, cospi); \
+ temp11 = vec_mulo(tmp16_1, cospi); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
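+// 8-point 1-D inverse DCT, mirroring idct8_c: stage 1 splits the odd inputs
+// into two cospi butterflies, stage 2 runs the 4-point even part plus the
+// odd add/sub pairs, stage 3 resolves the middle odd terms with cospi_16,
+// and stage 4 forms out[i] = step[i] +/- step[7 - i].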
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \
+ /* stage 1 */ \
+ step0 = in0; \
+ step2 = in4; \
+ step1 = in2; \
+ step3 = in6; \
+ \
+ STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \
+ STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
+ \
+ /* stage 2 */ \
+ STEP8_1(step0, step2, in1, in0, cospi16_v); \
+ STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \
+ in4 = vec_add(step4, step5); \
+ in5 = vec_sub(step4, step5); \
+ in6 = vec_sub(step7, step6); \
+ in7 = vec_add(step6, step7); \
+ \
+ /* stage 3 */ \
+ step0 = vec_add(in0, in3); \
+ step1 = vec_add(in1, in2); \
+ step2 = vec_sub(in1, in2); \
+ step3 = vec_sub(in0, in3); \
+ step4 = in4; \
+ STEP8_1(in6, in5, step5, step6, cospi16_v); \
+ step7 = in7; \
+ \
+ /* stage 4 */ \
+ in0 = vec_add(step0, step7); \
+ in1 = vec_add(step1, step6); \
+ in2 = vec_add(step2, step5); \
+ in3 = vec_add(step3, step4); \
+ in4 = vec_sub(step3, step4); \
+ in5 = vec_sub(step2, step5); \
+ in6 = vec_sub(step1, step6); \
+ in7 = vec_sub(step0, step7);
+
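+// Reconstruction add: out += (in + add) >> shiftx, i.e. a rounding shift of
+// the transform output added to the unpacked destination pixels.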
+#define PIXEL_ADD(in, out, add, shiftx) \
+ out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
+
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) {
+ int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
+ int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3;
+ int32x4_t temp10, temp11;
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+ out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+ IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+}
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) {
+ uint8x16_t zerov = vec_splat_u8(0);
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
+ uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
+ uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
+ uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
+ int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+ int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+ int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+ int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+ int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
+ int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
+ int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
+ int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
+ uint16x8_t shift5 = vec_splat_u16(5);
+ uint8x16_t output0, output1, output2, output3;
+
+ PIXEL_ADD(in[0], d_u0, add, shift5);
+ PIXEL_ADD(in[1], d_u1, add, shift5);
+ PIXEL_ADD(in[2], d_u2, add, shift5);
+ PIXEL_ADD(in[3], d_u3, add, shift5);
+ PIXEL_ADD(in[4], d_u4, add, shift5);
+ PIXEL_ADD(in[5], d_u5, add, shift5);
+ PIXEL_ADD(in[6], d_u6, add, shift5);
+ PIXEL_ADD(in[7], d_u7, add, shift5);
+ output0 = vec_packsu(d_u0, d_u1);
+ output1 = vec_packsu(d_u2, d_u3);
+ output2 = vec_packsu(d_u4, d_u5);
+ output3 = vec_packsu(d_u6, d_u7);
+
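+  // Each packed vector holds two 8-pixel result rows; xxpermdi recombines
+  // the 8 result bytes with the untouched half of the corresponding dest
+  // load, so the 16-byte stores only modify the 8 pixels of each row.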
+ vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
+ vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
+ vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
+ vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
+ vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
+ vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
+ vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
+ vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
+}
+
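+// Full 8x8 inverse DCT/add: two transposing 1-D passes (rows then columns),
+// then vpx_round_store8x8_vsx adds (x + 16) >> 5 to the destination.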
+void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t src[8], tmp[8];
+
+ src[0] = load_tran_low(0, input);
+ src[1] = load_tran_low(8 * sizeof(*input), input);
+ src[2] = load_tran_low(16 * sizeof(*input), input);
+ src[3] = load_tran_low(24 * sizeof(*input), input);
+ src[4] = load_tran_low(32 * sizeof(*input), input);
+ src[5] = load_tran_low(40 * sizeof(*input), input);
+ src[6] = load_tran_low(48 * sizeof(*input), input);
+ src[7] = load_tran_low(56 * sizeof(*input), input);
+
+ vpx_idct8_vsx(src, tmp);
+ vpx_idct8_vsx(tmp, src);
+
+ vpx_round_store8x8_vsx(src, dest, stride);
+}
+
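+// cospi_16 butterfly used by the larger transforms:
+//   outpt0 = round((inpt0 - inpt1) * cospi)
+//   outpt1 = round((inpt0 + inpt1) * cospi)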
+#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_mule(tmp16_0, cospi); \
+ temp11 = vec_mule(tmp16_1, cospi); \
+ temp20 = vec_mulo(tmp16_0, cospi); \
+ temp21 = vec_mulo(tmp16_1, cospi); \
+ temp30 = vec_sub(temp10, temp20); \
+ temp10 = vec_add(temp10, temp20); \
+ temp20 = vec_sub(temp11, temp21); \
+ temp21 = vec_add(temp11, temp21); \
+ DCT_CONST_ROUND_SHIFT(temp30); \
+ DCT_CONST_ROUND_SHIFT(temp20); \
+ outpt0 = vec_packs(temp30, temp20); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp21); \
+ outpt1 = vec_packs(temp10, temp21);
+
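+// 16-point 1-D inverse DCT over sixteen row vectors, following the seven
+// stage structure of idct16_c. The out* names double as scratch between
+// stages; passthroughs such as /* out0 = in0; */ mark values a stage leaves
+// in place.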
+#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \
+ inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, outA, outB, outC, outD, outE, outF) \
+ /* stage 1 */ \
+ /* out0 = in0; */ \
+ out1 = in8; \
+ out2 = in4; \
+ out3 = inC; \
+ out4 = in2; \
+ out5 = inA; \
+ out6 = in6; \
+ out7 = inE; \
+ out8 = in1; \
+ out9 = in9; \
+ outA = in5; \
+ outB = inD; \
+ outC = in3; \
+ outD = inB; \
+ outE = in7; \
+ outF = inF; \
+ \
+ /* stage 2 */ \
+ /* in0 = out0; */ \
+ in1 = out1; \
+ in2 = out2; \
+ in3 = out3; \
+ in4 = out4; \
+ in5 = out5; \
+ in6 = out6; \
+ in7 = out7; \
+ \
+ STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \
+ STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \
+ STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \
+ STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \
+ \
+ /* stage 3 */ \
+ out0 = in0; \
+ out1 = in1; \
+ out2 = in2; \
+ out3 = in3; \
+ \
+ STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \
+ STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \
+ \
+ out8 = vec_add(in8, in9); \
+ out9 = vec_sub(in8, in9); \
+ outA = vec_sub(inB, inA); \
+ outB = vec_add(inA, inB); \
+ outC = vec_add(inC, inD); \
+ outD = vec_sub(inC, inD); \
+ outE = vec_sub(inF, inE); \
+ outF = vec_add(inE, inF); \
+ \
+ /* stage 4 */ \
+ STEP16_1(out0, out1, in1, in0, cospi16_v); \
+ STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \
+ in4 = vec_add(out4, out5); \
+ in5 = vec_sub(out4, out5); \
+ in6 = vec_sub(out7, out6); \
+ in7 = vec_add(out6, out7); \
+ \
+ in8 = out8; \
+ inF = outF; \
+ tmp16_0 = vec_mergeh(out9, outE); \
+ tmp16_1 = vec_mergel(out9, outE); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ in9 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inE = vec_packs(temp10, temp11); \
+ \
+ tmp16_0 = vec_mergeh(outA, outD); \
+ tmp16_1 = vec_mergel(outA, outD); \
+ temp10 = \
+ vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \
+ temp11 = \
+ vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inA = vec_packs(temp10, temp11); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ inD = vec_packs(temp10, temp11); \
+ \
+ inB = outB; \
+ inC = outC; \
+ \
+ /* stage 5 */ \
+ out0 = vec_add(in0, in3); \
+ out1 = vec_add(in1, in2); \
+ out2 = vec_sub(in1, in2); \
+ out3 = vec_sub(in0, in3); \
+ out4 = in4; \
+ STEP16_1(in6, in5, out5, out6, cospi16_v); \
+ out7 = in7; \
+ \
+ out8 = vec_add(in8, inB); \
+ out9 = vec_add(in9, inA); \
+ outA = vec_sub(in9, inA); \
+ outB = vec_sub(in8, inB); \
+ outC = vec_sub(inF, inC); \
+ outD = vec_sub(inE, inD); \
+ outE = vec_add(inD, inE); \
+ outF = vec_add(inC, inF); \
+ \
+ /* stage 6 */ \
+ in0 = vec_add(out0, out7); \
+ in1 = vec_add(out1, out6); \
+ in2 = vec_add(out2, out5); \
+ in3 = vec_add(out3, out4); \
+ in4 = vec_sub(out3, out4); \
+ in5 = vec_sub(out2, out5); \
+ in6 = vec_sub(out1, out6); \
+ in7 = vec_sub(out0, out7); \
+ in8 = out8; \
+ in9 = out9; \
+ STEP16_1(outD, outA, inA, inD, cospi16_v); \
+ STEP16_1(outC, outB, inB, inC, cospi16_v); \
+ inE = outE; \
+ inF = outF; \
+ \
+ /* stage 7 */ \
+ out0 = vec_add(in0, inF); \
+ out1 = vec_add(in1, inE); \
+ out2 = vec_add(in2, inD); \
+ out3 = vec_add(in3, inC); \
+ out4 = vec_add(in4, inB); \
+ out5 = vec_add(in5, inA); \
+ out6 = vec_add(in6, in9); \
+ out7 = vec_add(in7, in8); \
+ out8 = vec_sub(in7, in8); \
+ out9 = vec_sub(in6, in9); \
+ outA = vec_sub(in5, inA); \
+ outB = vec_sub(in4, inB); \
+ outC = vec_sub(in3, inC); \
+ outD = vec_sub(in2, inD); \
+ outE = vec_sub(in1, inE); \
+ outF = vec_sub(in0, inF);
+
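+// Unpack one 16-pixel destination row, add two result vectors with the
+// (x + 32) >> 6 rounding constants set up by the caller, repack and store.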
+#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in0, d_uh, add, shift6); \
+ PIXEL_ADD(in1, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
+
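+// Row pass over one 16x8 half: src holds eight rows as (left, right) vector
+// pairs, so the even and odd indices are transposed separately, run through
+// IDCT16, and written back in the same interleaved layout.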
+static void half_idct16x8_vsx(int16x8_t *src) {
+ int16x8_t tmp0[8], tmp1[8];
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ int16x8_t tmp16_0, tmp16_1;
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12],
+ src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+ tmp0[6], tmp0[7]);
+ TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13],
+ src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+ tmp1[6], tmp1[7]);
+ IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+ tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+ src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14],
+ src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]);
+}
+
+void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) {
+ int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ int16x8_t tmp16_0, tmp16_1;
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+ src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+ tmp0[6], tmp0[7]);
+ TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+ src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+ tmp1[6], tmp1[7]);
+ TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+ src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
+ tmp2[6], tmp2[7]);
+ TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+ src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
+ tmp3[6], tmp3[7]);
+
+ IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+ tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+ src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+ src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
+ src1[12], src1[14]);
+
+ IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
+ tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
+ src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+ src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
+ src1[13], src1[15]);
+}
+
+void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest,
+ int stride) {
+ uint8x16_t destv[16];
+ int16x8_t d_uh, d_ul;
+ uint8x16_t zerov = vec_splat_u8(0);
+ uint16x8_t shift6 = vec_splat_u16(6);
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+
+ // load dest
+ LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv);
+
+ PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0);
+ PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride);
+ PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride);
+ PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride);
+ PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride);
+ PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride);
+ PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride);
+ PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride);
+
+ PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride);
+ PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride);
+ PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride);
+ PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride);
+ PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride);
+ PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride);
+ PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride);
+ PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride);
+}
+
+void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t src0[16], src1[16];
+ int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ int16x8_t tmp16_0, tmp16_1;
+ ROUND_SHIFT_INIT;
+
+ LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0);
+ LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
+ 8 * sizeof(*input), src1);
+
+ // transform rows
+ // transform the upper half of 16x16 matrix
+ half_idct16x8_vsx(src0);
+ TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+ src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+ tmp0[6], tmp0[7]);
+ TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+ src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+ tmp1[6], tmp1[7]);
+
+ // transform the lower half of 16x16 matrix
+ half_idct16x8_vsx(src1);
+ TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+ src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
+ tmp2[6], tmp2[7]);
+ TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+ src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
+ tmp3[6], tmp3[7]);
+
+ // transform columns
+ // left half first
+ IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+ tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
+ src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+ src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
+ src1[12], src1[14]);
+ // right half
+ IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+ tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
+ src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+ src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
+ src1[13], src1[15]);
+
+ vpx_round_store16x16_vsx(src0, src1, dest, stride);
+}
+
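+// Load a 32x8 band of coefficients as eight rows of four vectors each; the
+// offsets step in 16-byte units from the given base offset.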
+#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
+ in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
+ in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
+ in71, in72, in73, offset) \
+ /* load the first row from the 8x32 block */ \
+ in00 = load(offset, input); \
+ in01 = load(offset + 16, input); \
+ in02 = load(offset + 2 * 16, input); \
+ in03 = load(offset + 3 * 16, input); \
+ \
+ in10 = load(offset + 4 * 16, input); \
+ in11 = load(offset + 5 * 16, input); \
+ in12 = load(offset + 6 * 16, input); \
+ in13 = load(offset + 7 * 16, input); \
+ \
+ in20 = load(offset + 8 * 16, input); \
+ in21 = load(offset + 9 * 16, input); \
+ in22 = load(offset + 10 * 16, input); \
+ in23 = load(offset + 11 * 16, input); \
+ \
+ in30 = load(offset + 12 * 16, input); \
+ in31 = load(offset + 13 * 16, input); \
+ in32 = load(offset + 14 * 16, input); \
+ in33 = load(offset + 15 * 16, input); \
+ \
+ in40 = load(offset + 16 * 16, input); \
+ in41 = load(offset + 17 * 16, input); \
+ in42 = load(offset + 18 * 16, input); \
+ in43 = load(offset + 19 * 16, input); \
+ \
+ in50 = load(offset + 20 * 16, input); \
+ in51 = load(offset + 21 * 16, input); \
+ in52 = load(offset + 22 * 16, input); \
+ in53 = load(offset + 23 * 16, input); \
+ \
+ in60 = load(offset + 24 * 16, input); \
+ in61 = load(offset + 25 * 16, input); \
+ in62 = load(offset + 26 * 16, input); \
+ in63 = load(offset + 27 * 16, input); \
+ \
+ /* load the last row from the 8x32 block */ \
+ in70 = load(offset + 28 * 16, input); \
+ in71 = load(offset + 29 * 16, input); \
+ in72 = load(offset + 30 * 16, input); \
+ in73 = load(offset + 31 * 16, input);
+
+/* Butterfly with a negated first product:
+ *   temp1 = -step[x] * cospi_q + step[y] * cospi_z
+ *   temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+ temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
+/* Butterfly with both first products negated:
+ *   temp1 = -step[x] * cospi_q - step[y] * cospi_z
+ *   temp2 = -step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \
+ tmp16_0 = vec_mergeh(inpt0, inpt1); \
+ tmp16_1 = vec_mergel(inpt0, inpt1); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt0 = vec_packs(temp10, temp11); \
+ temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \
+ temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \
+ DCT_CONST_ROUND_SHIFT(temp10); \
+ DCT_CONST_ROUND_SHIFT(temp11); \
+ outpt1 = vec_packs(temp10, temp11);
+
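+// 32-point 1-D inverse DCT. in0..in3 are four banks of eight row vectors
+// (32 rows of 8 lanes); out is a 4x8 scratch array that also carries the
+// intermediate stages, and the final stage writes the result back to in*.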
+#define IDCT32(in0, in1, in2, in3, out) \
+ \
+ /* stage 1 */ \
+ /* out[0][0] = in[0][0]; */ \
+ out[0][1] = in2[0]; \
+ out[0][2] = in1[0]; \
+ out[0][3] = in3[0]; \
+ out[0][4] = in0[4]; \
+ out[0][5] = in2[4]; \
+ out[0][6] = in1[4]; \
+ out[0][7] = in3[4]; \
+ out[1][0] = in0[2]; \
+ out[1][1] = in2[2]; \
+ out[1][2] = in1[2]; \
+ out[1][3] = in3[2]; \
+ out[1][4] = in0[6]; \
+ out[1][5] = in2[6]; \
+ out[1][6] = in1[6]; \
+ out[1][7] = in3[6]; \
+ \
+ STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \
+ STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
+ STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \
+ STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \
+ STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \
+ STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
+ STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
+ STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \
+ \
+ /* stage 2 */ \
+ /* in0[0] = out[0][0]; */ \
+ in0[1] = out[0][1]; \
+ in0[2] = out[0][2]; \
+ in0[3] = out[0][3]; \
+ in0[4] = out[0][4]; \
+ in0[5] = out[0][5]; \
+ in0[6] = out[0][6]; \
+ in0[7] = out[0][7]; \
+ \
+ STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \
+ STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
+ STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
+ STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \
+ \
+ in2[0] = vec_add(out[2][0], out[2][1]); \
+ in2[1] = vec_sub(out[2][0], out[2][1]); \
+ in2[2] = vec_sub(out[2][3], out[2][2]); \
+ in2[3] = vec_add(out[2][3], out[2][2]); \
+ in2[4] = vec_add(out[2][4], out[2][5]); \
+ in2[5] = vec_sub(out[2][4], out[2][5]); \
+ in2[6] = vec_sub(out[2][7], out[2][6]); \
+ in2[7] = vec_add(out[2][7], out[2][6]); \
+ in3[0] = vec_add(out[3][0], out[3][1]); \
+ in3[1] = vec_sub(out[3][0], out[3][1]); \
+ in3[2] = vec_sub(out[3][3], out[3][2]); \
+ in3[3] = vec_add(out[3][3], out[3][2]); \
+ in3[4] = vec_add(out[3][4], out[3][5]); \
+ in3[5] = vec_sub(out[3][4], out[3][5]); \
+ in3[6] = vec_sub(out[3][7], out[3][6]); \
+ in3[7] = vec_add(out[3][6], out[3][7]); \
+ \
+ /* stage 3 */ \
+ out[0][0] = in0[0]; \
+ out[0][1] = in0[1]; \
+ out[0][2] = in0[2]; \
+ out[0][3] = in0[3]; \
+ \
+ STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \
+ STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
+ \
+ out[1][0] = vec_add(in1[0], in1[1]); \
+ out[1][1] = vec_sub(in1[0], in1[1]); \
+ out[1][2] = vec_sub(in1[3], in1[2]); \
+ out[1][3] = vec_add(in1[2], in1[3]); \
+ out[1][4] = vec_add(in1[4], in1[5]); \
+ out[1][5] = vec_sub(in1[4], in1[5]); \
+ out[1][6] = vec_sub(in1[7], in1[6]); \
+ out[1][7] = vec_add(in1[6], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[3][7] = in3[7]; \
+ STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \
+ STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \
+ cospi4m_v); \
+ out[2][3] = in2[3]; \
+ out[2][4] = in2[4]; \
+ STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \
+ STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
+ cospi20m_v); \
+ out[2][7] = in2[7]; \
+ out[3][0] = in3[0]; \
+ out[3][3] = in3[3]; \
+ out[3][4] = in3[4]; \
+ \
+ /* stage 4 */ \
+ STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \
+ STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \
+ in0[4] = vec_add(out[0][4], out[0][5]); \
+ in0[5] = vec_sub(out[0][4], out[0][5]); \
+ in0[6] = vec_sub(out[0][7], out[0][6]); \
+ in0[7] = vec_add(out[0][7], out[0][6]); \
+ \
+ in1[0] = out[1][0]; \
+ in1[7] = out[1][7]; \
+ STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \
+ STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ in1[3] = out[1][3]; \
+ in1[4] = out[1][4]; \
+ \
+ in2[0] = vec_add(out[2][0], out[2][3]); \
+ in2[1] = vec_add(out[2][1], out[2][2]); \
+ in2[2] = vec_sub(out[2][1], out[2][2]); \
+ in2[3] = vec_sub(out[2][0], out[2][3]); \
+ in2[4] = vec_sub(out[2][7], out[2][4]); \
+ in2[5] = vec_sub(out[2][6], out[2][5]); \
+ in2[6] = vec_add(out[2][5], out[2][6]); \
+ in2[7] = vec_add(out[2][4], out[2][7]); \
+ \
+ in3[0] = vec_add(out[3][0], out[3][3]); \
+ in3[1] = vec_add(out[3][1], out[3][2]); \
+ in3[2] = vec_sub(out[3][1], out[3][2]); \
+ in3[3] = vec_sub(out[3][0], out[3][3]); \
+ in3[4] = vec_sub(out[3][7], out[3][4]); \
+ in3[5] = vec_sub(out[3][6], out[3][5]); \
+ in3[6] = vec_add(out[3][5], out[3][6]); \
+ in3[7] = vec_add(out[3][4], out[3][7]); \
+ \
+ /* stage 5 */ \
+ out[0][0] = vec_add(in0[0], in0[3]); \
+ out[0][1] = vec_add(in0[1], in0[2]); \
+ out[0][2] = vec_sub(in0[1], in0[2]); \
+ out[0][3] = vec_sub(in0[0], in0[3]); \
+ out[0][4] = in0[4]; \
+ STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \
+ out[0][7] = in0[7]; \
+ \
+ out[1][0] = vec_add(in1[0], in1[3]); \
+ out[1][1] = vec_add(in1[1], in1[2]); \
+ out[1][2] = vec_sub(in1[1], in1[2]); \
+ out[1][3] = vec_sub(in1[0], in1[3]); \
+ out[1][4] = vec_sub(in1[7], in1[4]); \
+ out[1][5] = vec_sub(in1[6], in1[5]); \
+ out[1][6] = vec_add(in1[5], in1[6]); \
+ out[1][7] = vec_add(in1[4], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[2][1] = in2[1]; \
+ STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \
+ STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \
+ STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \
+ cospi8m_v); \
+ out[2][6] = in2[6]; \
+ out[2][7] = in2[7]; \
+ out[3][0] = in3[0]; \
+ out[3][1] = in3[1]; \
+ out[3][6] = in3[6]; \
+ out[3][7] = in3[7]; \
+ \
+ /* stage 6 */ \
+ in0[0] = vec_add(out[0][0], out[0][7]); \
+ in0[1] = vec_add(out[0][1], out[0][6]); \
+ in0[2] = vec_add(out[0][2], out[0][5]); \
+ in0[3] = vec_add(out[0][3], out[0][4]); \
+ in0[4] = vec_sub(out[0][3], out[0][4]); \
+ in0[5] = vec_sub(out[0][2], out[0][5]); \
+ in0[6] = vec_sub(out[0][1], out[0][6]); \
+ in0[7] = vec_sub(out[0][0], out[0][7]); \
+ in1[0] = out[1][0]; \
+ in1[1] = out[1][1]; \
+ STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \
+ STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \
+ in1[6] = out[1][6]; \
+ in1[7] = out[1][7]; \
+ \
+ in2[0] = vec_add(out[2][0], out[2][7]); \
+ in2[1] = vec_add(out[2][1], out[2][6]); \
+ in2[2] = vec_add(out[2][2], out[2][5]); \
+ in2[3] = vec_add(out[2][3], out[2][4]); \
+ in2[4] = vec_sub(out[2][3], out[2][4]); \
+ in2[5] = vec_sub(out[2][2], out[2][5]); \
+ in2[6] = vec_sub(out[2][1], out[2][6]); \
+ in2[7] = vec_sub(out[2][0], out[2][7]); \
+ \
+ in3[0] = vec_sub(out[3][7], out[3][0]); \
+ in3[1] = vec_sub(out[3][6], out[3][1]); \
+ in3[2] = vec_sub(out[3][5], out[3][2]); \
+ in3[3] = vec_sub(out[3][4], out[3][3]); \
+ in3[4] = vec_add(out[3][4], out[3][3]); \
+ in3[5] = vec_add(out[3][5], out[3][2]); \
+ in3[6] = vec_add(out[3][6], out[3][1]); \
+ in3[7] = vec_add(out[3][7], out[3][0]); \
+ \
+ /* stage 7 */ \
+ out[0][0] = vec_add(in0[0], in1[7]); \
+ out[0][1] = vec_add(in0[1], in1[6]); \
+ out[0][2] = vec_add(in0[2], in1[5]); \
+ out[0][3] = vec_add(in0[3], in1[4]); \
+ out[0][4] = vec_add(in0[4], in1[3]); \
+ out[0][5] = vec_add(in0[5], in1[2]); \
+ out[0][6] = vec_add(in0[6], in1[1]); \
+ out[0][7] = vec_add(in0[7], in1[0]); \
+ out[1][0] = vec_sub(in0[7], in1[0]); \
+ out[1][1] = vec_sub(in0[6], in1[1]); \
+ out[1][2] = vec_sub(in0[5], in1[2]); \
+ out[1][3] = vec_sub(in0[4], in1[3]); \
+ out[1][4] = vec_sub(in0[3], in1[4]); \
+ out[1][5] = vec_sub(in0[2], in1[5]); \
+ out[1][6] = vec_sub(in0[1], in1[6]); \
+ out[1][7] = vec_sub(in0[0], in1[7]); \
+ \
+ out[2][0] = in2[0]; \
+ out[2][1] = in2[1]; \
+ out[2][2] = in2[2]; \
+ out[2][3] = in2[3]; \
+ STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \
+ STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \
+ STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \
+ STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \
+ out[3][4] = in3[4]; \
+ out[3][5] = in3[5]; \
+ out[3][6] = in3[6]; \
+ out[3][7] = in3[7]; \
+ \
+ /* final */ \
+ in0[0] = vec_add(out[0][0], out[3][7]); \
+ in0[1] = vec_add(out[0][1], out[3][6]); \
+ in0[2] = vec_add(out[0][2], out[3][5]); \
+ in0[3] = vec_add(out[0][3], out[3][4]); \
+ in0[4] = vec_add(out[0][4], out[3][3]); \
+ in0[5] = vec_add(out[0][5], out[3][2]); \
+ in0[6] = vec_add(out[0][6], out[3][1]); \
+ in0[7] = vec_add(out[0][7], out[3][0]); \
+ in1[0] = vec_add(out[1][0], out[2][7]); \
+ in1[1] = vec_add(out[1][1], out[2][6]); \
+ in1[2] = vec_add(out[1][2], out[2][5]); \
+ in1[3] = vec_add(out[1][3], out[2][4]); \
+ in1[4] = vec_add(out[1][4], out[2][3]); \
+ in1[5] = vec_add(out[1][5], out[2][2]); \
+ in1[6] = vec_add(out[1][6], out[2][1]); \
+ in1[7] = vec_add(out[1][7], out[2][0]); \
+ in2[0] = vec_sub(out[1][7], out[2][0]); \
+ in2[1] = vec_sub(out[1][6], out[2][1]); \
+ in2[2] = vec_sub(out[1][5], out[2][2]); \
+ in2[3] = vec_sub(out[1][4], out[2][3]); \
+ in2[4] = vec_sub(out[1][3], out[2][4]); \
+ in2[5] = vec_sub(out[1][2], out[2][5]); \
+ in2[6] = vec_sub(out[1][1], out[2][6]); \
+ in2[7] = vec_sub(out[1][0], out[2][7]); \
+ in3[0] = vec_sub(out[0][7], out[3][0]); \
+ in3[1] = vec_sub(out[0][6], out[3][1]); \
+ in3[2] = vec_sub(out[0][5], out[3][2]); \
+ in3[3] = vec_sub(out[0][4], out[3][3]); \
+ in3[4] = vec_sub(out[0][3], out[3][4]); \
+ in3[5] = vec_sub(out[0][2], out[3][5]); \
+ in3[6] = vec_sub(out[0][1], out[3][6]); \
+ in3[7] = vec_sub(out[0][0], out[3][7]);
+
+// NOT A FULL TRANSPOSE! Transposes each 8x8 block within each row; it does
+// not transpose the rows themselves.
+#define TRANSPOSE_8x32(in, out) \
+ /* transpose 4 of 8x8 blocks */ \
+ TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \
+ in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
+ out[0][4], out[0][5], out[0][6], out[0][7]); \
+ TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \
+ in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
+ out[1][4], out[1][5], out[1][6], out[1][7]); \
+ TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \
+ in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
+ out[2][4], out[2][5], out[2][6], out[2][7]); \
+ TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \
+ in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
+ out[3][4], out[3][5], out[3][6], out[3][7]);
+
+#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \
+ dst = vec_vsx_ld((step)*stride, dest); \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in0, d_uh, add, shift6); \
+ PIXEL_ADD(in1, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \
+ dst = vec_vsx_ld((step)*stride + 16, dest); \
+ d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
+ d_ul = (int16x8_t)vec_mergel(dst, zerov); \
+ PIXEL_ADD(in2, d_uh, add, shift6); \
+ PIXEL_ADD(in3, d_ul, add, shift6); \
+ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);
+
+#define ADD_STORE_BLOCK(in, offset) \
+ PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \
+ PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \
+ PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \
+ PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \
+ PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \
+ PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \
+ PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \
+ PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7);
+
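+// Full 32x32 inverse DCT/add: the row pass handles four 32x8 bands (per-band
+// 8x8-block transpose, IDCT32, transpose back), the column pass runs IDCT32
+// across the four bands in place, and ADD_STORE_BLOCK adds (x + 32) >> 6 to
+// the destination.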
+void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
+ int16x8_t tmp16_0, tmp16_1;
+ int32x4_t temp10, temp11, temp20, temp21, temp30;
+ uint8x16_t dst;
+ int16x8_t d_uh, d_ul;
+ int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+ uint16x8_t shift6 = vec_splat_u16(6);
+ uint8x16_t zerov = vec_splat_u8(0);
+
+ ROUND_SHIFT_INIT;
+
+ LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
+ src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
+ src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
+ src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
+ src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
+ src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
+ src0[1][7], src0[2][7], src0[3][7], 0);
+ // Rows
+ // transpose the first row of 8x8 blocks
+ TRANSPOSE_8x32(src0, tmp);
+ // transform the 32x8 column
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
+ TRANSPOSE_8x32(tmp, src0);
+
+ LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
+ src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
+ src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
+ src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
+ src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
+ src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
+ src1[1][7], src1[2][7], src1[3][7], 512);
+ TRANSPOSE_8x32(src1, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
+ TRANSPOSE_8x32(tmp, src1);
+
+ LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
+ src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
+ src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
+ src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
+ src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
+ src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
+ src2[1][7], src2[2][7], src2[3][7], 1024);
+ TRANSPOSE_8x32(src2, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
+ TRANSPOSE_8x32(tmp, src2);
+
+ LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
+ src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
+ src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
+ src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
+ src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
+ src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
+ src3[1][7], src3[2][7], src3[3][7], 1536);
+ TRANSPOSE_8x32(src3, tmp);
+ IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
+ TRANSPOSE_8x32(tmp, src3);
+
+ // Columns
+ IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
+ IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
+ IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
+ IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);
+
+ ADD_STORE_BLOCK(src0, 0);
+ ADD_STORE_BLOCK(src1, 8);
+ ADD_STORE_BLOCK(src2, 16);
+ ADD_STORE_BLOCK(src3, 24);
+}
+
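+// One column pass of the 4x4 inverse Walsh-Hadamard transform; the add/sub
+// chain matches iwht4x4_16_add_c in vpx_dsp/inv_txfm.c:
+//   a += c; d -= b; e = (a - d) >> 1; b = e - b; c = e - c; a -= b; d += c;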
+#define TRANSFORM_COLS \
+ v32_a = vec_add(v32_a, v32_c); \
+ v32_d = vec_sub(v32_d, v32_b); \
+ v32_e = vec_sub(v32_a, v32_d); \
+ v32_e = vec_sra(v32_e, one); \
+ v32_b = vec_sub(v32_e, v32_b); \
+ v32_c = vec_sub(v32_e, v32_c); \
+ v32_a = vec_sub(v32_a, v32_b); \
+ v32_d = vec_add(v32_d, v32_c); \
+ v_a = vec_packs(v32_a, v32_b); \
+ v_c = vec_packs(v32_c, v32_d);
+
+#define TRANSPOSE_WHT \
+ tmp_a = vec_mergeh(v_a, v_c); \
+ tmp_c = vec_mergel(v_a, v_c); \
+ v_a = vec_mergeh(tmp_a, tmp_c); \
+ v_c = vec_mergel(tmp_a, tmp_c);
+
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t v_a = load_tran_low(0, input);
+ int16x8_t v_c = load_tran_low(8 * sizeof(*input), input);
+ int16x8_t tmp_a, tmp_c;
+ uint16x8_t two = vec_splat_u16(2);
+ uint32x4_t one = vec_splat_u32(1);
+ int16x8_t tmp16_0, tmp16_1;
+ int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e;
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0);
+ int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1);
+ int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2);
+ int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3);
+ uint8x16_t output_v;
+ uint8_t tmp_dest[16];
+ int i, j;
+
+ v_a = vec_sra(v_a, two);
+ v_c = vec_sra(v_c, two);
+
+ TRANSPOSE_WHT;
+
+ v32_a = vec_unpackh(v_a);
+ v32_c = vec_unpackl(v_a);
+
+ v32_d = vec_unpackh(v_c);
+ v32_b = vec_unpackl(v_c);
+
+ TRANSFORM_COLS;
+
+ TRANSPOSE_WHT;
+
+ v32_a = vec_unpackh(v_a);
+ v32_c = vec_unpackl(v_a);
+ v32_d = vec_unpackh(v_c);
+ v32_b = vec_unpackl(v_c);
+
+ TRANSFORM_COLS;
+
+ PACK_STORE(v_a, v_c);
+}
+
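+// 4-point ADST (vp9 iadst4). The sinpi constants are interleaved in pairs so
+// that each vec_msum(x, {A, B, A, B, ...}, acc) accumulates
+// x_even * A + x_odd * B per 32-bit lane, covering two scalar multiplies at
+// once.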
+void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) {
+ int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v;
+ int32x4_t v_v[5], u_v[4];
+ int32x4_t zerov = vec_splat_s32(0);
+ int16x8_t tmp0, tmp1;
+ int16x8_t zero16v = vec_splat_s16(0);
+ uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1));
+ ROUND_SHIFT_INIT;
+
+ sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v);
+ sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v);
+ sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v);
+ sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v);
+ sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v),
+ vec_sub(zero16v, sinpi_3_9_v));
+
+ tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]);
+ tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]);
+ in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1);
+ in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1);
+
+ v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov);
+ v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov);
+ v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov);
+ v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov);
+ v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov);
+
+ in[0] = vec_sub(in[0], in[1]);
+ in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16);
+ in[0] = vec_add(in[0], in[1]);
+ in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16);
+
+ u_v[0] = vec_add(v_v[0], v_v[1]);
+ u_v[1] = vec_sub(v_v[2], v_v[3]);
+ u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov);
+ u_v[3] = vec_sub(v_v[1], v_v[3]);
+ u_v[3] = vec_add(u_v[3], v_v[4]);
+
+ DCT_CONST_ROUND_SHIFT(u_v[0]);
+ DCT_CONST_ROUND_SHIFT(u_v[1]);
+ DCT_CONST_ROUND_SHIFT(u_v[2]);
+ DCT_CONST_ROUND_SHIFT(u_v[3]);
+
+ out[0] = vec_packs(u_v[0], u_v[1]);
+ out[1] = vec_packs(u_v[2], u_v[3]);
+}
+
+#define MSUM_ROUND_SHIFT(a, b, cospi) \
+ b = vec_msums(a, cospi, zerov); \
+ DCT_CONST_ROUND_SHIFT(b);
+
+#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \
+ MSUM_ROUND_SHIFT(in0, tmp0, cospi); \
+ MSUM_ROUND_SHIFT(in1, tmp1, cospi); \
+ out = vec_packs(tmp0, tmp1);
+
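+// 8-point ADST (vp9 iadst8). vec_mergel(cospiA_v, cospiB_v) builds the
+// {A, B, A, B, ...} multiplier pairs (the p/m in the names marks the sign),
+// and vec_msum on mergeh/mergel-interleaved inputs computes each
+// x * A + y * B butterfly term in a single multiply-accumulate.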
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) {
+ int32x4_t tmp0[16], tmp1[16];
+
+ int32x4_t zerov = vec_splat_s32(0);
+ int16x8_t zero16v = vec_splat_s16(0);
+ int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v);
+ int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v);
+ int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v);
+ int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v);
+ int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v);
+ int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v);
+ int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v);
+ int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v);
+ int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v);
+ int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v);
+ int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v);
+ int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v);
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+ out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ in[0] = vec_mergeh(out[7], out[0]);
+ in[1] = vec_mergel(out[7], out[0]);
+ in[2] = vec_mergeh(out[5], out[2]);
+ in[3] = vec_mergel(out[5], out[2]);
+ in[4] = vec_mergeh(out[3], out[4]);
+ in[5] = vec_mergel(out[3], out[4]);
+ in[6] = vec_mergeh(out[1], out[6]);
+ in[7] = vec_mergel(out[1], out[6]);
+
+ tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov);
+ tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov);
+ tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov);
+ tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov);
+ tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov);
+ tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov);
+ tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov);
+ tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov);
+ tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov);
+ tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov);
+ tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov);
+ tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov);
+ tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov);
+ tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov);
+ tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov);
+ tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov);
+
+ tmp0[0] = vec_add(tmp1[0], tmp1[8]);
+ tmp0[1] = vec_add(tmp1[1], tmp1[9]);
+ tmp0[2] = vec_add(tmp1[2], tmp1[10]);
+ tmp0[3] = vec_add(tmp1[3], tmp1[11]);
+ tmp0[4] = vec_add(tmp1[4], tmp1[12]);
+ tmp0[5] = vec_add(tmp1[5], tmp1[13]);
+ tmp0[6] = vec_add(tmp1[6], tmp1[14]);
+ tmp0[7] = vec_add(tmp1[7], tmp1[15]);
+ tmp0[8] = vec_sub(tmp1[0], tmp1[8]);
+ tmp0[9] = vec_sub(tmp1[1], tmp1[9]);
+ tmp0[10] = vec_sub(tmp1[2], tmp1[10]);
+ tmp0[11] = vec_sub(tmp1[3], tmp1[11]);
+ tmp0[12] = vec_sub(tmp1[4], tmp1[12]);
+ tmp0[13] = vec_sub(tmp1[5], tmp1[13]);
+ tmp0[14] = vec_sub(tmp1[6], tmp1[14]);
+ tmp0[15] = vec_sub(tmp1[7], tmp1[15]);
+
+ // shift and rounding
+ DCT_CONST_ROUND_SHIFT(tmp0[0]);
+ DCT_CONST_ROUND_SHIFT(tmp0[1]);
+ DCT_CONST_ROUND_SHIFT(tmp0[2]);
+ DCT_CONST_ROUND_SHIFT(tmp0[3]);
+ DCT_CONST_ROUND_SHIFT(tmp0[4]);
+ DCT_CONST_ROUND_SHIFT(tmp0[5]);
+ DCT_CONST_ROUND_SHIFT(tmp0[6]);
+ DCT_CONST_ROUND_SHIFT(tmp0[7]);
+ DCT_CONST_ROUND_SHIFT(tmp0[8]);
+ DCT_CONST_ROUND_SHIFT(tmp0[9]);
+ DCT_CONST_ROUND_SHIFT(tmp0[10]);
+ DCT_CONST_ROUND_SHIFT(tmp0[11]);
+ DCT_CONST_ROUND_SHIFT(tmp0[12]);
+ DCT_CONST_ROUND_SHIFT(tmp0[13]);
+ DCT_CONST_ROUND_SHIFT(tmp0[14]);
+ DCT_CONST_ROUND_SHIFT(tmp0[15]);
+
+ // back to 16-bit
+ out[0] = vec_packs(tmp0[0], tmp0[1]);
+ out[1] = vec_packs(tmp0[2], tmp0[3]);
+ out[2] = vec_packs(tmp0[4], tmp0[5]);
+ out[3] = vec_packs(tmp0[6], tmp0[7]);
+ out[4] = vec_packs(tmp0[8], tmp0[9]);
+ out[5] = vec_packs(tmp0[10], tmp0[11]);
+ out[6] = vec_packs(tmp0[12], tmp0[13]);
+ out[7] = vec_packs(tmp0[14], tmp0[15]);
+
+ // stage 2
+ in[0] = vec_add(out[0], out[2]);
+ in[1] = vec_add(out[1], out[3]);
+ in[2] = vec_sub(out[0], out[2]);
+ in[3] = vec_sub(out[1], out[3]);
+ in[4] = vec_mergeh(out[4], out[5]);
+ in[5] = vec_mergel(out[4], out[5]);
+ in[6] = vec_mergeh(out[6], out[7]);
+ in[7] = vec_mergel(out[6], out[7]);
+
+ tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov);
+ tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov);
+ tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov);
+ tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov);
+ tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov);
+ tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov);
+ tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov);
+ tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov);
+
+ tmp0[0] = vec_add(tmp1[0], tmp1[4]);
+ tmp0[1] = vec_add(tmp1[1], tmp1[5]);
+ tmp0[2] = vec_add(tmp1[2], tmp1[6]);
+ tmp0[3] = vec_add(tmp1[3], tmp1[7]);
+ tmp0[4] = vec_sub(tmp1[0], tmp1[4]);
+ tmp0[5] = vec_sub(tmp1[1], tmp1[5]);
+ tmp0[6] = vec_sub(tmp1[2], tmp1[6]);
+ tmp0[7] = vec_sub(tmp1[3], tmp1[7]);
+
+ DCT_CONST_ROUND_SHIFT(tmp0[0]);
+ DCT_CONST_ROUND_SHIFT(tmp0[1]);
+ DCT_CONST_ROUND_SHIFT(tmp0[2]);
+ DCT_CONST_ROUND_SHIFT(tmp0[3]);
+ DCT_CONST_ROUND_SHIFT(tmp0[4]);
+ DCT_CONST_ROUND_SHIFT(tmp0[5]);
+ DCT_CONST_ROUND_SHIFT(tmp0[6]);
+ DCT_CONST_ROUND_SHIFT(tmp0[7]);
+
+ in[4] = vec_packs(tmp0[0], tmp0[1]);
+ in[5] = vec_packs(tmp0[2], tmp0[3]);
+ in[6] = vec_packs(tmp0[4], tmp0[5]);
+ in[7] = vec_packs(tmp0[6], tmp0[7]);
+
+ // stage 3
+ out[0] = vec_mergeh(in[2], in[3]);
+ out[1] = vec_mergel(in[2], in[3]);
+ out[2] = vec_mergeh(in[6], in[7]);
+ out[3] = vec_mergel(in[6], in[7]);
+
+ IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v);
+ IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v);
+ IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v);
+ IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v);
+
+ out[0] = in[0];
+ out[2] = in[6];
+ out[4] = in[3];
+ out[6] = in[5];
+
+ out[1] = vec_sub(zero16v, in[4]);
+ out[3] = vec_sub(zero16v, in[2]);
+ out[5] = vec_sub(zero16v, in[7]);
+ out[7] = vec_sub(zero16v, in[1]);
+}
+
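+// 16-point ADST over eight-lane vectors, using the same paired-constant
+// vec_msum scheme as vp9_iadst8_vsx but with sixteen stage-1 butterflies.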
+static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) {
+ int32x4_t tmp0[32], tmp1[32];
+ int16x8_t tmp16_0[8];
+ int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v);
+ int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v);
+ int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v);
+ int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v);
+ int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v);
+ int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v);
+ int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v);
+ int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v);
+ int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v);
+ int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v);
+ int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v);
+ int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v);
+ int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v);
+ int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v);
+ int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v);
+ int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v);
+ int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v);
+ int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v);
+ int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v);
+ int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v);
+ int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v);
+ int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v);
+ int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v);
+ int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v);
+ int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v);
+ int32x4_t zerov = vec_splat_s32(0);
+ ROUND_SHIFT_INIT;
+
+ tmp16_0[0] = vec_mergeh(in[15], in[0]);
+ tmp16_0[1] = vec_mergel(in[15], in[0]);
+ tmp16_0[2] = vec_mergeh(in[13], in[2]);
+ tmp16_0[3] = vec_mergel(in[13], in[2]);
+ tmp16_0[4] = vec_mergeh(in[11], in[4]);
+ tmp16_0[5] = vec_mergel(in[11], in[4]);
+ tmp16_0[6] = vec_mergeh(in[9], in[6]);
+ tmp16_0[7] = vec_mergel(in[9], in[6]);
+ tmp16_0[8] = vec_mergeh(in[7], in[8]);
+ tmp16_0[9] = vec_mergel(in[7], in[8]);
+ tmp16_0[10] = vec_mergeh(in[5], in[10]);
+ tmp16_0[11] = vec_mergel(in[5], in[10]);
+ tmp16_0[12] = vec_mergeh(in[3], in[12]);
+ tmp16_0[13] = vec_mergel(in[3], in[12]);
+ tmp16_0[14] = vec_mergeh(in[1], in[14]);
+ tmp16_0[15] = vec_mergel(in[1], in[14]);
+
+ tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov);
+ tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov);
+ tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov);
+ tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov);
+ tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov);
+ tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov);
+ tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov);
+ tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov);
+ tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov);
+ tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov);
+ tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov);
+ tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov);
+ tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov);
+ tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov);
+ tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov);
+ tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov);
+ tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov);
+ tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov);
+ tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov);
+ tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov);
+ tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov);
+ tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov);
+ tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov);
+ tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov);
+ tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov);
+ tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov);
+ tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov);
+ tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov);
+ tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov);
+ tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov);
+ tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov);
+ tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov);
+
+ tmp1[0] = vec_add(tmp0[0], tmp0[16]);
+ tmp1[1] = vec_add(tmp0[1], tmp0[17]);
+ tmp1[2] = vec_add(tmp0[2], tmp0[18]);
+ tmp1[3] = vec_add(tmp0[3], tmp0[19]);
+ tmp1[4] = vec_add(tmp0[4], tmp0[20]);
+ tmp1[5] = vec_add(tmp0[5], tmp0[21]);
+ tmp1[6] = vec_add(tmp0[6], tmp0[22]);
+ tmp1[7] = vec_add(tmp0[7], tmp0[23]);
+ tmp1[8] = vec_add(tmp0[8], tmp0[24]);
+ tmp1[9] = vec_add(tmp0[9], tmp0[25]);
+ tmp1[10] = vec_add(tmp0[10], tmp0[26]);
+ tmp1[11] = vec_add(tmp0[11], tmp0[27]);
+ tmp1[12] = vec_add(tmp0[12], tmp0[28]);
+ tmp1[13] = vec_add(tmp0[13], tmp0[29]);
+ tmp1[14] = vec_add(tmp0[14], tmp0[30]);
+ tmp1[15] = vec_add(tmp0[15], tmp0[31]);
+ tmp1[16] = vec_sub(tmp0[0], tmp0[16]);
+ tmp1[17] = vec_sub(tmp0[1], tmp0[17]);
+ tmp1[18] = vec_sub(tmp0[2], tmp0[18]);
+ tmp1[19] = vec_sub(tmp0[3], tmp0[19]);
+ tmp1[20] = vec_sub(tmp0[4], tmp0[20]);
+ tmp1[21] = vec_sub(tmp0[5], tmp0[21]);
+ tmp1[22] = vec_sub(tmp0[6], tmp0[22]);
+ tmp1[23] = vec_sub(tmp0[7], tmp0[23]);
+ tmp1[24] = vec_sub(tmp0[8], tmp0[24]);
+ tmp1[25] = vec_sub(tmp0[9], tmp0[25]);
+ tmp1[26] = vec_sub(tmp0[10], tmp0[26]);
+ tmp1[27] = vec_sub(tmp0[11], tmp0[27]);
+ tmp1[28] = vec_sub(tmp0[12], tmp0[28]);
+ tmp1[29] = vec_sub(tmp0[13], tmp0[29]);
+ tmp1[30] = vec_sub(tmp0[14], tmp0[30]);
+ tmp1[31] = vec_sub(tmp0[15], tmp0[31]);
+
+ DCT_CONST_ROUND_SHIFT(tmp1[0]);
+ DCT_CONST_ROUND_SHIFT(tmp1[1]);
+ DCT_CONST_ROUND_SHIFT(tmp1[2]);
+ DCT_CONST_ROUND_SHIFT(tmp1[3]);
+ DCT_CONST_ROUND_SHIFT(tmp1[4]);
+ DCT_CONST_ROUND_SHIFT(tmp1[5]);
+ DCT_CONST_ROUND_SHIFT(tmp1[6]);
+ DCT_CONST_ROUND_SHIFT(tmp1[7]);
+ DCT_CONST_ROUND_SHIFT(tmp1[8]);
+ DCT_CONST_ROUND_SHIFT(tmp1[9]);
+ DCT_CONST_ROUND_SHIFT(tmp1[10]);
+ DCT_CONST_ROUND_SHIFT(tmp1[11]);
+ DCT_CONST_ROUND_SHIFT(tmp1[12]);
+ DCT_CONST_ROUND_SHIFT(tmp1[13]);
+ DCT_CONST_ROUND_SHIFT(tmp1[14]);
+ DCT_CONST_ROUND_SHIFT(tmp1[15]);
+ DCT_CONST_ROUND_SHIFT(tmp1[16]);
+ DCT_CONST_ROUND_SHIFT(tmp1[17]);
+ DCT_CONST_ROUND_SHIFT(tmp1[18]);
+ DCT_CONST_ROUND_SHIFT(tmp1[19]);
+ DCT_CONST_ROUND_SHIFT(tmp1[20]);
+ DCT_CONST_ROUND_SHIFT(tmp1[21]);
+ DCT_CONST_ROUND_SHIFT(tmp1[22]);
+ DCT_CONST_ROUND_SHIFT(tmp1[23]);
+ DCT_CONST_ROUND_SHIFT(tmp1[24]);
+ DCT_CONST_ROUND_SHIFT(tmp1[25]);
+ DCT_CONST_ROUND_SHIFT(tmp1[26]);
+ DCT_CONST_ROUND_SHIFT(tmp1[27]);
+ DCT_CONST_ROUND_SHIFT(tmp1[28]);
+ DCT_CONST_ROUND_SHIFT(tmp1[29]);
+ DCT_CONST_ROUND_SHIFT(tmp1[30]);
+ DCT_CONST_ROUND_SHIFT(tmp1[31]);
+
+ in[0] = vec_packs(tmp1[0], tmp1[1]);
+ in[1] = vec_packs(tmp1[2], tmp1[3]);
+ in[2] = vec_packs(tmp1[4], tmp1[5]);
+ in[3] = vec_packs(tmp1[6], tmp1[7]);
+ in[4] = vec_packs(tmp1[8], tmp1[9]);
+ in[5] = vec_packs(tmp1[10], tmp1[11]);
+ in[6] = vec_packs(tmp1[12], tmp1[13]);
+ in[7] = vec_packs(tmp1[14], tmp1[15]);
+ in[8] = vec_packs(tmp1[16], tmp1[17]);
+ in[9] = vec_packs(tmp1[18], tmp1[19]);
+ in[10] = vec_packs(tmp1[20], tmp1[21]);
+ in[11] = vec_packs(tmp1[22], tmp1[23]);
+ in[12] = vec_packs(tmp1[24], tmp1[25]);
+ in[13] = vec_packs(tmp1[26], tmp1[27]);
+ in[14] = vec_packs(tmp1[28], tmp1[29]);
+ in[15] = vec_packs(tmp1[30], tmp1[31]);
+
+ // stage 2
+ tmp16_0[0] = vec_mergeh(in[8], in[9]);
+ tmp16_0[1] = vec_mergel(in[8], in[9]);
+ tmp16_0[2] = vec_mergeh(in[10], in[11]);
+ tmp16_0[3] = vec_mergel(in[10], in[11]);
+ tmp16_0[4] = vec_mergeh(in[12], in[13]);
+ tmp16_0[5] = vec_mergel(in[12], in[13]);
+ tmp16_0[6] = vec_mergeh(in[14], in[15]);
+ tmp16_0[7] = vec_mergel(in[14], in[15]);
+
+ tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov);
+ tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov);
+ tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov);
+ tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov);
+ tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov);
+ tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov);
+ tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov);
+ tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov);
+ tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov);
+ tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov);
+ tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov);
+ tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov);
+ tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov);
+ tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov);
+ tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov);
+ tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov);
+
+ tmp1[0] = vec_add(tmp0[0], tmp0[8]);
+ tmp1[1] = vec_add(tmp0[1], tmp0[9]);
+ tmp1[2] = vec_add(tmp0[2], tmp0[10]);
+ tmp1[3] = vec_add(tmp0[3], tmp0[11]);
+ tmp1[4] = vec_add(tmp0[4], tmp0[12]);
+ tmp1[5] = vec_add(tmp0[5], tmp0[13]);
+ tmp1[6] = vec_add(tmp0[6], tmp0[14]);
+ tmp1[7] = vec_add(tmp0[7], tmp0[15]);
+ tmp1[8] = vec_sub(tmp0[0], tmp0[8]);
+ tmp1[9] = vec_sub(tmp0[1], tmp0[9]);
+ tmp1[10] = vec_sub(tmp0[2], tmp0[10]);
+ tmp1[11] = vec_sub(tmp0[3], tmp0[11]);
+ tmp1[12] = vec_sub(tmp0[4], tmp0[12]);
+ tmp1[13] = vec_sub(tmp0[5], tmp0[13]);
+ tmp1[14] = vec_sub(tmp0[6], tmp0[14]);
+ tmp1[15] = vec_sub(tmp0[7], tmp0[15]);
+
+ DCT_CONST_ROUND_SHIFT(tmp1[0]);
+ DCT_CONST_ROUND_SHIFT(tmp1[1]);
+ DCT_CONST_ROUND_SHIFT(tmp1[2]);
+ DCT_CONST_ROUND_SHIFT(tmp1[3]);
+ DCT_CONST_ROUND_SHIFT(tmp1[4]);
+ DCT_CONST_ROUND_SHIFT(tmp1[5]);
+ DCT_CONST_ROUND_SHIFT(tmp1[6]);
+ DCT_CONST_ROUND_SHIFT(tmp1[7]);
+ DCT_CONST_ROUND_SHIFT(tmp1[8]);
+ DCT_CONST_ROUND_SHIFT(tmp1[9]);
+ DCT_CONST_ROUND_SHIFT(tmp1[10]);
+ DCT_CONST_ROUND_SHIFT(tmp1[11]);
+ DCT_CONST_ROUND_SHIFT(tmp1[12]);
+ DCT_CONST_ROUND_SHIFT(tmp1[13]);
+ DCT_CONST_ROUND_SHIFT(tmp1[14]);
+ DCT_CONST_ROUND_SHIFT(tmp1[15]);
+
+ tmp16_0[0] = vec_add(in[0], in[4]);
+ tmp16_0[1] = vec_add(in[1], in[5]);
+ tmp16_0[2] = vec_add(in[2], in[6]);
+ tmp16_0[3] = vec_add(in[3], in[7]);
+ tmp16_0[4] = vec_sub(in[0], in[4]);
+ tmp16_0[5] = vec_sub(in[1], in[5]);
+ tmp16_0[6] = vec_sub(in[2], in[6]);
+ tmp16_0[7] = vec_sub(in[3], in[7]);
+ tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]);
+ tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]);
+ tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]);
+ tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]);
+ tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]);
+ tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]);
+ tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]);
+ tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]);
+
+ // stage 3
+ in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]);
+ in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]);
+ in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]);
+ in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]);
+ in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]);
+ in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]);
+ in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]);
+ in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]);
+
+ tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov);
+ tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov);
+ tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov);
+ tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov);
+ tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov);
+ tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov);
+ tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov);
+ tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov);
+ tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov);
+ tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov);
+ tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov);
+ tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov);
+ tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov);
+ tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov);
+ tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov);
+ tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov);
+
+ tmp1[0] = vec_add(tmp0[0], tmp0[4]);
+ tmp1[1] = vec_add(tmp0[1], tmp0[5]);
+ tmp1[2] = vec_add(tmp0[2], tmp0[6]);
+ tmp1[3] = vec_add(tmp0[3], tmp0[7]);
+ tmp1[4] = vec_sub(tmp0[0], tmp0[4]);
+ tmp1[5] = vec_sub(tmp0[1], tmp0[5]);
+ tmp1[6] = vec_sub(tmp0[2], tmp0[6]);
+ tmp1[7] = vec_sub(tmp0[3], tmp0[7]);
+ tmp1[8] = vec_add(tmp0[8], tmp0[12]);
+ tmp1[9] = vec_add(tmp0[9], tmp0[13]);
+ tmp1[10] = vec_add(tmp0[10], tmp0[14]);
+ tmp1[11] = vec_add(tmp0[11], tmp0[15]);
+ tmp1[12] = vec_sub(tmp0[8], tmp0[12]);
+ tmp1[13] = vec_sub(tmp0[9], tmp0[13]);
+ tmp1[14] = vec_sub(tmp0[10], tmp0[14]);
+ tmp1[15] = vec_sub(tmp0[11], tmp0[15]);
+
+ DCT_CONST_ROUND_SHIFT(tmp1[0]);
+ DCT_CONST_ROUND_SHIFT(tmp1[1]);
+ DCT_CONST_ROUND_SHIFT(tmp1[2]);
+ DCT_CONST_ROUND_SHIFT(tmp1[3]);
+ DCT_CONST_ROUND_SHIFT(tmp1[4]);
+ DCT_CONST_ROUND_SHIFT(tmp1[5]);
+ DCT_CONST_ROUND_SHIFT(tmp1[6]);
+ DCT_CONST_ROUND_SHIFT(tmp1[7]);
+ DCT_CONST_ROUND_SHIFT(tmp1[8]);
+ DCT_CONST_ROUND_SHIFT(tmp1[9]);
+ DCT_CONST_ROUND_SHIFT(tmp1[10]);
+ DCT_CONST_ROUND_SHIFT(tmp1[11]);
+ DCT_CONST_ROUND_SHIFT(tmp1[12]);
+ DCT_CONST_ROUND_SHIFT(tmp1[13]);
+ DCT_CONST_ROUND_SHIFT(tmp1[14]);
+ DCT_CONST_ROUND_SHIFT(tmp1[15]);
+
+ in[0] = vec_add(tmp16_0[0], tmp16_0[2]);
+ in[1] = vec_add(tmp16_0[1], tmp16_0[3]);
+ in[2] = vec_sub(tmp16_0[0], tmp16_0[2]);
+ in[3] = vec_sub(tmp16_0[1], tmp16_0[3]);
+ in[4] = vec_packs(tmp1[0], tmp1[1]);
+ in[5] = vec_packs(tmp1[2], tmp1[3]);
+ in[6] = vec_packs(tmp1[4], tmp1[5]);
+ in[7] = vec_packs(tmp1[6], tmp1[7]);
+ in[8] = vec_add(tmp16_0[8], tmp16_0[10]);
+ in[9] = vec_add(tmp16_0[9], tmp16_0[11]);
+ in[10] = vec_sub(tmp16_0[8], tmp16_0[10]);
+ in[11] = vec_sub(tmp16_0[9], tmp16_0[11]);
+ in[12] = vec_packs(tmp1[8], tmp1[9]);
+ in[13] = vec_packs(tmp1[10], tmp1[11]);
+ in[14] = vec_packs(tmp1[12], tmp1[13]);
+ in[15] = vec_packs(tmp1[14], tmp1[15]);
+
+ // stage 4
+ out[0] = vec_mergeh(in[2], in[3]);
+ out[1] = vec_mergel(in[2], in[3]);
+ out[2] = vec_mergeh(in[6], in[7]);
+ out[3] = vec_mergel(in[6], in[7]);
+ out[4] = vec_mergeh(in[10], in[11]);
+ out[5] = vec_mergel(in[10], in[11]);
+ out[6] = vec_mergeh(in[14], in[15]);
+ out[7] = vec_mergel(in[14], in[15]);
+}
+
+void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) {
+ int16x8_t tmp0[16], tmp1[16], tmp2[8];
+ int32x4_t tmp3, tmp4;
+ int16x8_t zero16v = vec_splat_s16(0);
+ int32x4_t zerov = vec_splat_s32(0);
+ int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v);
+ int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v);
+ ROUND_SHIFT_INIT;
+
+ TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+ src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+ tmp0[6], tmp0[7]);
+ TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+ src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+ tmp1[6], tmp1[7]);
+ TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+ src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12],
+ tmp0[13], tmp0[14], tmp0[15]);
+ TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+ src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12],
+ tmp1[13], tmp1[14], tmp1[15]);
+
+ iadst16x8_vsx(tmp0, tmp2);
+ IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v);
+ IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16);
+ IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v);
+ IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16);
+ IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v);
+ IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16);
+ IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v);
+ IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16);
+
+ src0[0] = tmp0[0];
+ src0[2] = vec_sub(zero16v, tmp0[8]);
+ src0[4] = tmp0[12];
+ src0[6] = vec_sub(zero16v, tmp0[4]);
+ src1[8] = tmp0[5];
+ src1[10] = vec_sub(zero16v, tmp0[13]);
+ src1[12] = tmp0[9];
+ src1[14] = vec_sub(zero16v, tmp0[1]);
+
+ iadst16x8_vsx(tmp1, tmp2);
+ IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v);
+ IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16);
+ IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v);
+ IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16);
+ IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v);
+ IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16);
+ IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v);
+ IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16);
+
+ src0[1] = tmp1[0];
+ src0[3] = vec_sub(zero16v, tmp1[8]);
+ src0[5] = tmp1[12];
+ src0[7] = vec_sub(zero16v, tmp1[4]);
+ src1[9] = tmp1[5];
+ src1[11] = vec_sub(zero16v, tmp1[13]);
+ src1[13] = tmp1[9];
+ src1[15] = vec_sub(zero16v, tmp1[1]);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h
new file mode 100644
index 0000000000..7031742c1c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
+#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
+ int stride);
+void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out);
+void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out);
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride);
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out);
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out);
+
+#define LOAD_INPUT16(load, source, offset, step, in) \
+ in[0] = load(offset, source); \
+ in[1] = load((step) + (offset), source); \
+ in[2] = load(2 * (step) + (offset), source); \
+ in[3] = load(3 * (step) + (offset), source); \
+ in[4] = load(4 * (step) + (offset), source); \
+ in[5] = load(5 * (step) + (offset), source); \
+ in[6] = load(6 * (step) + (offset), source); \
+ in[7] = load(7 * (step) + (offset), source); \
+ in[8] = load(8 * (step) + (offset), source); \
+ in[9] = load(9 * (step) + (offset), source); \
+ in[10] = load(10 * (step) + (offset), source); \
+ in[11] = load(11 * (step) + (offset), source); \
+ in[12] = load(12 * (step) + (offset), source); \
+ in[13] = load(13 * (step) + (offset), source); \
+ in[14] = load(14 * (step) + (offset), source); \
+ in[15] = load(15 * (step) + (offset), source);
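
A hedged usage sketch (the buffer name is hypothetical, not from the patch): LOAD_INPUT16 expands to sixteen loads at offset, offset + step, ..., offset + 15 * step; with vec_vsx_ld as the loader, the byte offset comes first and the base pointer second, matching load(offset, source) above.

static void load_16x8_block(const int16_t *coeff /* hypothetical buffer */,
                            int16x8_t in[16]) {
  /* Sixteen consecutive vector loads, 16 bytes (one int16x8_t) apart. */
  LOAD_INPUT16(vec_vsx_ld, coeff, 0, 16, in);
}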
+
+void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest,
+ int stride);
+void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1);
+void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1);
+
+#endif // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c
new file mode 100644
index 0000000000..ab71f6e235
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+ const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+ return vec_xor(vec_add(a, mask), mask);
+}
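
For readers unfamiliar with the trick, a scalar model of the branchless sign transfer above (illustrative only, assuming arithmetic right shifts):

static int16_t sign_scalar(int16_t a, int16_t b) {
  /* mask is 0 when b >= 0 and -1 (all bits set) when b < 0, so
   * (a + mask) ^ mask leaves a unchanged or two's-complement negates it,
   * mirroring the vec_sra/vec_add/vec_xor sequence above. */
  const int16_t mask = (int16_t)(b >> 15);
  return (int16_t)((a + mask) ^ mask);
}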
+
+// Sets the value of each 32-bit integer to 1 when the corresponding value in a
+// is negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+ return vec_sr(a, vec_shift_sign_s32);
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >> 15) + C; we need >> 16, so we perform an extra
+  // right shift.
+ return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
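
Per lane, this is equivalent to the following scalar sketch (illustrative; it ignores the 16-bit saturation that vec_madds applies before the extra shift):

static int16_t mulhi_scalar(int16_t a, int16_t b) {
  const int32_t prod = (int32_t)a * b;
  /* (prod >> 15) >> 1 equals prod >> 16 for arithmetic shifts. */
  return (int16_t)(prod >> 16);
}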
+
+// Quantization function used for 4x4, 8x8 and 16x16 blocks.
+static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
+ int16x8_t round, int16x8_t quant,
+ int16x8_t quant_shift, bool16x8_t mask) {
+ const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+ int16x8_t qcoeff = vec_mulhi(rounded, quant);
+ qcoeff = vec_add(qcoeff, rounded);
+ qcoeff = vec_mulhi(qcoeff, quant_shift);
+ qcoeff = vec_sign(qcoeff, coeff);
+ return vec_and(qcoeff, mask);
+}
+
+// Quantization function used for 32x32 blocks.
+static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs,
+ int16x8_t round, int16x8_t quant,
+ int16x8_t quant_shift,
+ bool16x8_t mask) {
+ const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+ int16x8_t qcoeff = vec_mulhi(rounded, quant);
+ qcoeff = vec_add(qcoeff, rounded);
+  // 32x32 blocks require an extra multiplication by 2; this cancels the extra
+  // right shift added in vec_mulhi, so vec_madds can be used directly instead
+  // of vec_mulhi: (((a * b) >> 15) >> 1) << 1 == ((a * b) >> 15)
+ qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16);
+ qcoeff = vec_sign(qcoeff, coeff);
+ return vec_and(qcoeff, mask);
+}
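
Ignoring the zbin mask and the saturating adds, the 32x32 path above roughly corresponds to this scalar sketch (illustrative only):

static int16_t quantize_32_scalar(int16_t coeff, int16_t round, int16_t quant,
                                  int16_t quant_shift) {
  const int32_t abs_coeff = coeff < 0 ? -coeff : coeff;
  const int64_t tmp = abs_coeff + round;
  /* The final shift is >> 15 rather than the >> 16 used for smaller blocks,
   * which is why vec_madds replaces vec_mulhi in the vector code above. */
  const int64_t q = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 15;
  return (int16_t)(coeff < 0 ? -q : q);
}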
+
+// Dequantization function used for 32x32 blocks. Quantized coeffs of 32x32
+// blocks are twice as big as for the other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+ int16x8_t dequant) {
+ int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+ int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero, because the C code uses division.
+ dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+ dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+ dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+ dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+ return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
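
The round-toward-zero step can be modelled in scalar code as follows (illustrative sketch):

static int32_t div2_trunc(int32_t x) {
  /* An arithmetic shift rounds toward minus infinity; adding 1 to negative
   * values first makes the shift match the truncating x / 2 of the C code. */
  return (x + (x < 0)) >> 1;
}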
+
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
+ const int16_t *iscan_ptr, int index) {
+ int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+ bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+ return vec_andc(scan, zero_coeff);
+}
+
+// Reduce the packed 16-bit integers in a to their maximum, returning a vector
+// with the largest value across a replicated in every element.
+static INLINE int16x8_t vec_max_across(int16x8_t a) {
+ a = vec_max(a, vec_perm(a, a, vec_perm64));
+ a = vec_max(a, vec_perm(a, a, vec_perm32));
+ return vec_max(a, vec_perm(a, a, vec_perm16));
+}
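
The three permute/max folds form a log2 reduction; a scalar equivalent of the value the callers read back from lane 0 (sketch, using the vector-indexing syntax already used elsewhere in this file):

static int16_t max_across_scalar(int16x8_t a) {
  int16_t m = a[0];
  int i;
  for (i = 1; i < 8; i++) {
    if (a[i] > m) m = a[i];
  }
  return m;
}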
+
+void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+ bool16x8_t zero_mask0, zero_mask1;
+
+  // The first set of 8 coeffs starts with DC + 7 AC
+ int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
+ int16x8_t round = vec_vsx_ld(0, round_ptr);
+ int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+ int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+ int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
+
+ int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+ int16x8_t coeff0_abs = vec_abs(coeff0);
+ int16x8_t coeff1_abs = vec_abs(coeff1);
+
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zbin = vec_splat(zbin, 1);
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+
+ (void)scan_ptr;
+
+ qcoeff0 =
+ quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+ vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+ round = vec_splat(round, 1);
+ quant = vec_splat(quant, 1);
+ quant_shift = vec_splat(quant_shift, 1);
+ qcoeff1 =
+ quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+ vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+ dequant = vec_splat(dequant, 1);
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
+
+ if (n_coeffs > 16) {
+ int index = 16;
+ int off0 = 32;
+ int off1 = 48;
+ int off2 = 64;
+ do {
+ int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+ bool16x8_t zero_mask2;
+ coeff0 = vec_vsx_ld(off0, coeff_ptr);
+ coeff1 = vec_vsx_ld(off1, coeff_ptr);
+ coeff2 = vec_vsx_ld(off2, coeff_ptr);
+ coeff0_abs = vec_abs(coeff0);
+ coeff1_abs = vec_abs(coeff1);
+ coeff2_abs = vec_abs(coeff2);
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+ zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+ qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+ zero_mask0);
+ qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+ zero_mask1);
+ qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+ zero_mask2);
+ vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+ vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+ vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+
+ vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
+ eob = vec_max(eob, eob2);
+
+ index += 24;
+ off0 += 48;
+ off1 += 48;
+ off2 += 48;
+ } while (index < n_coeffs);
+ }
+
+ eob = vec_max_across(eob);
+ *eob_ptr = eob[0];
+}
+
+void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ // In stage 1, we quantize 16 coeffs (DC + 15 AC)
+ // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
+ // (32 * 32 - 16) / 24 = 42
+ int num_itr = 42;
+ // Offsets are in bytes, 16 coeffs = 32 bytes
+ int off0 = 32;
+ int off1 = 48;
+ int off2 = 64;
+
+ int16x8_t qcoeff0, qcoeff1, eob;
+ bool16x8_t zero_mask0, zero_mask1;
+
+ int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
+ int16x8_t round = vec_vsx_ld(0, round_ptr);
+ int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+ int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+ int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
+
+ int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+ int16x8_t coeff0_abs = vec_abs(coeff0);
+ int16x8_t coeff1_abs = vec_abs(coeff1);
+
+ (void)scan_ptr;
+ (void)n_coeffs;
+
+ // 32x32 quantization requires that zbin and round be divided by 2
+ zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16);
+ round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
+
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zbin = vec_splat(zbin, 1); // remove DC from zbin
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+
+ qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift,
+ zero_mask0);
+ round = vec_splat(round, 1); // remove DC from round
+ quant = vec_splat(quant, 1); // remove DC from quant
+ quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift
+ qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift,
+ zero_mask1);
+
+ vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+ vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+ vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr);
+ dequant = vec_splat(dequant, 1); // remove DC from dequant
+ vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
+
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
+
+ do {
+ int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
+ bool16x8_t zero_mask2;
+
+ coeff0 = vec_vsx_ld(off0, coeff_ptr);
+ coeff1 = vec_vsx_ld(off1, coeff_ptr);
+ coeff2 = vec_vsx_ld(off2, coeff_ptr);
+
+ coeff0_abs = vec_abs(coeff0);
+ coeff1_abs = vec_abs(coeff1);
+ coeff2_abs = vec_abs(coeff2);
+
+ zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+ zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+ zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+
+ qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift,
+ zero_mask0);
+ qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift,
+ zero_mask1);
+ qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift,
+ zero_mask2);
+
+ vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+ vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+ vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+ vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr);
+ vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
+ vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
+
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
+ eob = vec_max(eob, eob2);
+
+ // 24 int16_t is 48 bytes
+ off0 += 48;
+ off1 += 48;
+ off2 += 48;
+ num_itr--;
+ } while (num_itr != 0);
+
+ eob = vec_max_across(eob);
+ *eob_ptr = eob[0];
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c
new file mode 100644
index 0000000000..a08ae12413
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define PROCESS16(offset) \
+ v_a = vec_vsx_ld(offset, a); \
+ v_b = vec_vsx_ld(offset, b); \
+ v_abs = vec_absd(v_a, v_b); \
+ v_sad = vec_sum4s(v_abs, v_sad);
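
A scalar sketch of one PROCESS16 step (illustrative): vec_absd yields per-byte absolute differences and vec_sum4s adds each group of four bytes into one of four running 32-bit partial sums.

static void process16_scalar(const uint8_t *a, const uint8_t *b,
                             uint32_t v_sad[4]) {
  int i;
  for (i = 0; i < 16; i++) {
    const uint32_t d = a[i] > b[i] ? (uint32_t)(a[i] - b[i])
                                   : (uint32_t)(b[i] - a[i]);
    v_sad[i / 4] += d; /* four bytes per 32-bit accumulator */
  }
}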
+
+#define SAD8(height) \
+ unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y = 0; \
+ uint8x16_t v_a, v_b, v_abs; \
+ uint32x4_t v_sad = vec_zeros_u32; \
+ \
+ do { \
+ PROCESS16(0) \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ y++; \
+ } while (y < height); \
+ \
+ return v_sad[1] + v_sad[0]; \
+ }
+
+#define SAD16(height) \
+ unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y = 0; \
+ uint8x16_t v_a, v_b, v_abs; \
+ uint32x4_t v_sad = vec_zeros_u32; \
+ \
+ do { \
+ PROCESS16(0); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ y++; \
+ } while (y < height); \
+ \
+ return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
+ }
+
+#define SAD32(height) \
+ unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y = 0; \
+ uint8x16_t v_a, v_b, v_abs; \
+ uint32x4_t v_sad = vec_zeros_u32; \
+ \
+ do { \
+ PROCESS16(0); \
+ PROCESS16(16); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ y++; \
+ } while (y < height); \
+ \
+ return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
+ }
+
+#define SAD64(height) \
+ unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y = 0; \
+ uint8x16_t v_a, v_b, v_abs; \
+ uint32x4_t v_sad = vec_zeros_u32; \
+ \
+ do { \
+ PROCESS16(0); \
+ PROCESS16(16); \
+ PROCESS16(32); \
+ PROCESS16(48); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ y++; \
+ } while (y < height); \
+ \
+ return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \
+ }
+
+SAD8(4);
+SAD8(8);
+SAD8(16);
+SAD16(8);
+SAD16(16);
+SAD16(32);
+SAD32(16);
+SAD32(32);
+SAD32(64);
+SAD64(32);
+SAD64(64);
+
+#define SAD16AVG(height) \
+ unsigned int vpx_sad16x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \
+ ref_stride); \
+ \
+ return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \
+ }
+
+#define SAD32AVG(height) \
+ unsigned int vpx_sad32x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \
+ ref_stride); \
+ \
+ return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \
+ }
+
+#define SAD64AVG(height) \
+ unsigned int vpx_sad64x##height##_avg_vsx( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \
+ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \
+ ref_stride); \
+ return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \
+ }
+
+SAD16AVG(8);
+SAD16AVG(16);
+SAD16AVG(32);
+SAD32AVG(16);
+SAD32AVG(32);
+SAD32AVG(64);
+SAD64AVG(32);
+SAD64AVG(64);
+
+#define PROCESS16_4D(offset, ref, v_h, v_l) \
+ v_b = vec_vsx_ld(offset, ref); \
+ v_bh = unpack_to_s16_h(v_b); \
+ v_bl = unpack_to_s16_l(v_b); \
+ v_subh = vec_sub(v_h, v_bh); \
+ v_subl = vec_sub(v_l, v_bl); \
+ v_absh = vec_abs(v_subh); \
+ v_absl = vec_abs(v_subl); \
+ v_sad = vec_sum4s(v_absh, v_sad); \
+ v_sad = vec_sum4s(v_absl, v_sad);
+
+#define UNPACK_SRC(offset, srcv_h, srcv_l) \
+ v_a = vec_vsx_ld(offset, src); \
+ srcv_h = unpack_to_s16_h(v_a); \
+ srcv_l = unpack_to_s16_l(v_a);
+
+#define SAD16_4D(height) \
+ void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah, v_al); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+#define SAD32_4D(height) \
+ void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
+ int16x8_t v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah1, v_al1); \
+ UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \
+ PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+#define SAD64_4D(height) \
+ void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \
+ int16x8_t v_ah3, v_al3, v_ah4, v_al4; \
+ int16x8_t v_absh, v_absl, v_subh, v_subl; \
+ \
+ for (i = 0; i < 4; i++) sad_array[i] = 0; \
+ \
+ for (y = 0; y < height; y++) { \
+ UNPACK_SRC(y *src_stride, v_ah1, v_al1); \
+ UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \
+ UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \
+ UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \
+ for (i = 0; i < 4; i++) { \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \
+ PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \
+ PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \
+ PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \
+ \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \
+ } \
+ } \
+ }
+
+SAD16_4D(8);
+SAD16_4D(16);
+SAD16_4D(32);
+SAD32_4D(16);
+SAD32_4D(32);
+SAD32_4D(64);
+SAD64_4D(32);
+SAD64_4D(64);
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c
new file mode 100644
index 0000000000..76ad302da6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+ int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+ int16_t *diff1 = diff + 2 * diff_stride;
+ const uint8_t *src1 = src + 2 * src_stride;
+ const uint8_t *pred1 = pred + 2 * pred_stride;
+
+ const int16x8_t d0 = vec_vsx_ld(0, diff);
+ const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+ const int16x8_t d2 = vec_vsx_ld(0, diff1);
+ const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+ const uint8x16_t s0 = read4x2(src, (int)src_stride);
+ const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+ const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+ const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+ const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+ vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+ vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+ vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+ vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r = rows, c;
+
+ switch (cols) {
+ case 64:
+ case 32:
+ do {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+ const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+ const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+ const int16x8_t d0l =
+ vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+ const int16x8_t d0h =
+ vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ const int16x8_t d1l =
+ vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+ const int16x8_t d1h =
+ vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+ vec_vsx_st(d0h, 0, diff + c);
+ vec_vsx_st(d0l, 16, diff + c);
+ vec_vsx_st(d1h, 0, diff + c + 16);
+ vec_vsx_st(d1l, 16, diff + c + 16);
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 16:
+ do {
+ const uint8x16_t s0 = vec_vsx_ld(0, src);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred);
+ const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+ const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ vec_vsx_st(d0h, 0, diff);
+ vec_vsx_st(d0l, 16, diff);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 8:
+ do {
+ const uint8x16_t s0 = vec_vsx_ld(0, src);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred);
+ const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ vec_vsx_st(d0h, 0, diff);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 4:
+ subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+ if (r > 4) {
+ diff += 4 * diff_stride;
+ pred += 4 * pred_stride;
+ src += 4 * src_stride;
+
+      subtract_block4x4(diff, diff_stride,
+                        src, src_stride,
+                        pred, pred_stride);
+ }
+ break;
+ default: assert(0); // unreachable
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h
new file mode 100644
index 0000000000..4883b734ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
+#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
+ // d = vec_mergeh(a,b):
+ // The even elements of the result are obtained left-to-right,
+ // from the high elements of a.
+ // The odd elements of the result are obtained left-to-right,
+ // from the high elements of b.
+ //
+ // d = vec_mergel(a,b):
+ // The even elements of the result are obtained left-to-right,
+ // from the low elements of a.
+ // The odd elements of the result are obtained left-to-right,
+ // from the low elements of b.
+
+ // Example, starting with:
+ // v[0]: 00 01 02 03 04 05 06 07
+ // v[1]: 10 11 12 13 14 15 16 17
+ // v[2]: 20 21 22 23 24 25 26 27
+ // v[3]: 30 31 32 33 34 35 36 37
+ // v[4]: 40 41 42 43 44 45 46 47
+ // v[5]: 50 51 52 53 54 55 56 57
+ // v[6]: 60 61 62 63 64 65 66 67
+ // v[7]: 70 71 72 73 74 75 76 77
+
+ int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+ b0 = vec_mergeh(v[0], v[4]);
+ b1 = vec_mergel(v[0], v[4]);
+ b2 = vec_mergeh(v[1], v[5]);
+ b3 = vec_mergel(v[1], v[5]);
+ b4 = vec_mergeh(v[2], v[6]);
+ b5 = vec_mergel(v[2], v[6]);
+ b6 = vec_mergeh(v[3], v[7]);
+ b7 = vec_mergel(v[3], v[7]);
+
+ // After first merge operation
+ // b0: 00 40 01 41 02 42 03 43
+ // b1: 04 44 05 45 06 46 07 47
+ // b2: 10 50 11 51 12 52 13 53
+ // b3: 14 54 15 55 16 56 17 57
+ // b4: 20 60 21 61 22 62 23 63
+ // b5: 24 64 25 65 26 66 27 67
+  // b6: 30 70 31 71 32 72 33 73
+ // b7: 34 74 35 75 36 76 37 77
+
+ c0 = vec_mergeh(b0, b4);
+ c1 = vec_mergel(b0, b4);
+ c2 = vec_mergeh(b1, b5);
+ c3 = vec_mergel(b1, b5);
+ c4 = vec_mergeh(b2, b6);
+ c5 = vec_mergel(b2, b6);
+ c6 = vec_mergeh(b3, b7);
+ c7 = vec_mergel(b3, b7);
+
+ // After second merge operation
+ // c0: 00 20 40 60 01 21 41 61
+ // c1: 02 22 42 62 03 23 43 63
+ // c2: 04 24 44 64 05 25 45 65
+ // c3: 06 26 46 66 07 27 47 67
+ // c4: 10 30 50 70 11 31 51 71
+ // c5: 12 32 52 72 13 33 53 73
+ // c6: 14 34 54 74 15 35 55 75
+ // c7: 16 36 56 76 17 37 57 77
+
+ v[0] = vec_mergeh(c0, c4);
+ v[1] = vec_mergel(c0, c4);
+ v[2] = vec_mergeh(c1, c5);
+ v[3] = vec_mergel(c1, c5);
+ v[4] = vec_mergeh(c2, c6);
+ v[5] = vec_mergel(c2, c6);
+ v[6] = vec_mergeh(c3, c7);
+ v[7] = vec_mergel(c3, c7);
+
+ // After last merge operation
+ // v[0]: 00 10 20 30 40 50 60 70
+ // v[1]: 01 11 21 31 41 51 61 71
+ // v[2]: 02 12 22 32 42 52 62 72
+ // v[3]: 03 13 23 33 43 53 63 73
+ // v[4]: 04 14 24 34 44 54 64 74
+ // v[5]: 05 15 25 35 45 55 65 75
+ // v[6]: 06 16 26 36 46 56 66 76
+ // v[7]: 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
+ // Stage 1
+ const int16x8_t s1_0 = vec_mergeh(a[0], a[4]);
+ const int16x8_t s1_1 = vec_mergel(a[0], a[4]);
+ const int16x8_t s1_2 = vec_mergeh(a[1], a[5]);
+ const int16x8_t s1_3 = vec_mergel(a[1], a[5]);
+ const int16x8_t s1_4 = vec_mergeh(a[2], a[6]);
+ const int16x8_t s1_5 = vec_mergel(a[2], a[6]);
+ const int16x8_t s1_6 = vec_mergeh(a[3], a[7]);
+ const int16x8_t s1_7 = vec_mergel(a[3], a[7]);
+
+ // Stage 2
+ const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4);
+ const int16x8_t s2_1 = vec_mergel(s1_0, s1_4);
+ const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5);
+ const int16x8_t s2_3 = vec_mergel(s1_1, s1_5);
+ const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6);
+ const int16x8_t s2_5 = vec_mergel(s1_2, s1_6);
+ const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7);
+ const int16x8_t s2_7 = vec_mergel(s1_3, s1_7);
+
+  // Stage 3
+ b[0] = vec_mergeh(s2_0, s2_4);
+ b[1] = vec_mergel(s2_0, s2_4);
+ b[2] = vec_mergeh(s2_1, s2_5);
+ b[3] = vec_mergel(s2_1, s2_5);
+ b[4] = vec_mergeh(s2_2, s2_6);
+ b[5] = vec_mergel(s2_2, s2_6);
+ b[6] = vec_mergeh(s2_3, s2_7);
+ b[7] = vec_mergel(s2_3, s2_7);
+}
+
+#endif // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h
new file mode 100644
index 0000000000..2907a1fe40
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
+#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 };
+
+static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 };
+
+static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+ 16364, 16364, 16364, 16364 };
+static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+ 16305, 16305, 16305, 16305 };
+static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+ 16207, 16207, 16207, 16207 };
+static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+ 16069, 16069, 16069, 16069 };
+static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+ -16069, -16069, -16069, -16069 };
+static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+ 15893, 15893, 15893, 15893 };
+static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+ 15679, 15679, 15679, 15679 };
+static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+ 15426, 15426, 15426, 15426 };
+static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+ 15137, 15137, 15137, 15137 };
+static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+ -15137, -15137, -15137, -15137 };
+static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+ 14811, 14811, 14811, 14811 };
+static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+ 14449, 14449, 14449, 14449 };
+static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+ 14053, 14053, 14053, 14053 };
+static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+ 13623, 13623, 13623, 13623 };
+static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+ 13160, 13160, 13160, 13160 };
+static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+ 12665, 12665, 12665, 12665 };
+static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+ 12140, 12140, 12140, 12140 };
+static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+ 11585, 11585, 11585, 11585 };
+static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+ 11003, 11003, 11003, 11003 };
+static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+ 10394, 10394, 10394, 10394 };
+static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
+ 9760, 9760, 9760, 9760 };
+static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
+ 9102, 9102, 9102, 9102 };
+static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+ -9102, -9102, -9102, -9102 };
+static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
+ 8423, 8423, 8423, 8423 };
+static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
+ 7723, 7723, 7723, 7723 };
+static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
+ 7005, 7005, 7005, 7005 };
+static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
+ 6270, 6270, 6270, 6270 };
+static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
+ 5520, 5520, 5520, 5520 };
+static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
+ 4756, 4756, 4756, 4756 };
+static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
+ 3981, 3981, 3981, 3981 };
+static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
+ 3196, 3196, 3196, 3196 };
+static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
+ 2404, 2404, 2404, 2404 };
+static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
+ 1606, 1606, 1606, 1606 };
+static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+
+#endif // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h
new file mode 100644
index 0000000000..b891169245
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_
+#define VPX_VPX_DSP_PPC_TYPES_VSX_H_
+
+#include <altivec.h>
+
+typedef vector signed char int8x16_t;
+typedef vector unsigned char uint8x16_t;
+typedef vector signed short int16x8_t;
+typedef vector unsigned short uint16x8_t;
+typedef vector signed int int32x4_t;
+typedef vector unsigned int uint32x4_t;
+typedef vector bool char bool8x16_t;
+typedef vector bool short bool16x8_t;
+typedef vector bool int bool32x4_t;
+
+#if defined(__clang__) && __clang_major__ < 6
+static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm)
+#elif defined(__GNUC__) && \
+ (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3))
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define unpack_to_u16_h(v) \
+ (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_u16_l(v) \
+ (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_h(v) \
+ (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_l(v) \
+ (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#ifndef xxpermdi
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+#else
+#define unpack_to_u16_h(v) \
+ (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_u16_l(v) \
+ (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_h(v) \
+ (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_l(v) \
+ (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#ifndef xxpermdi
+#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c)&1) << 1) ^ 3)
+#endif
+#endif
+
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+ const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+ const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+ return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
+#ifndef __POWER9_VECTOR__
+#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
+
+static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 };
+static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 };
+static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 };
+static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 };
+static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 };
+static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07 };
+static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03 };
+static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D,
+ 0x0E, 0x0F, 0x00, 0x01 };
+
+static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
+ 0x04, 0x05, 0x14, 0x15,
+ 0x08, 0x09, 0x18, 0x19,
+ 0x0C, 0x0D, 0x1C, 0x1D };
+
+#endif // VPX_VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
new file mode 100644
index 0000000000..be9614a358
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ int distortion;
+
+ const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+ const int16x8_t a1 =
+ unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
+ const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+ const int16x8_t b1 =
+ unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+ const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
+ const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);
+
+ vec_ste(d, 0, &distortion);
+
+ return distortion;
+}
+
+// TODO(lu_zero): Unroll
+uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
+ unsigned int i, sum = 0;
+ int32x4_t s = vec_splat_s32(0);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
+ s = vec_msum(v, v, s);
+ }
+
+ s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+ vec_ste((uint32x4_t)s, 0, &sum);
+
+ return sum;
+}
+
+void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+ /* comp_pred and pred must be 16 byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
+ if (width >= 16) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
+ vec_vsx_st(v, j, comp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+    // Process 2 lines at a time
+ for (i = 0; i < height / 2; ++i) {
+ const uint8x16_t r0 = vec_vsx_ld(0, ref);
+ const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
+ const uint8x16_t r = xxpermdi(r0, r1, 0);
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+ vec_vsx_st(v, 0, comp_pred);
+ comp_pred += 16; // width * 2;
+ pred += 16; // width * 2;
+ ref += ref_stride * 2;
+ }
+ } else {
+ assert(width == 4);
+    // Process 4 lines at a time
+ for (i = 0; i < height / 4; ++i) {
+ const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
+ const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
+ const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
+ const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
+ const uint8x16_t r =
+ (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+ vec_vsx_st(v, 0, comp_pred);
+ comp_pred += 16; // width * 4;
+ pred += 16; // width * 4;
+ ref += ref_stride * 4;
+ }
+ }
+}
+
+static INLINE void variance_inner_32(const uint8_t *src_ptr,
+ const uint8_t *ref_ptr,
+ int32x4_t *sum_squared, int32x4_t *sum) {
+ int32x4_t s = *sum;
+ int32x4_t ss = *sum_squared;
+
+ const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
+ const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
+ const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
+ const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);
+
+ const int16x8_t a0 = unpack_to_s16_h(va0);
+ const int16x8_t b0 = unpack_to_s16_h(vb0);
+ const int16x8_t a1 = unpack_to_s16_l(va0);
+ const int16x8_t b1 = unpack_to_s16_l(vb0);
+ const int16x8_t a2 = unpack_to_s16_h(va1);
+ const int16x8_t b2 = unpack_to_s16_h(vb1);
+ const int16x8_t a3 = unpack_to_s16_l(va1);
+ const int16x8_t b3 = unpack_to_s16_l(vb1);
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+ const int16x8_t d2 = vec_sub(a2, b2);
+ const int16x8_t d3 = vec_sub(a3, b3);
+
+ s = vec_sum4s(d0, s);
+ ss = vec_msum(d0, d0, ss);
+ s = vec_sum4s(d1, s);
+ ss = vec_msum(d1, d1, ss);
+ s = vec_sum4s(d2, s);
+ ss = vec_msum(d2, d2, ss);
+ s = vec_sum4s(d3, s);
+ ss = vec_msum(d3, d3, ss);
+ *sum = s;
+ *sum_squared = ss;
+}
+
+static INLINE void variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum) {
+ int i;
+
+ int32x4_t s = vec_splat_s32(0);
+ int32x4_t ss = vec_splat_s32(0);
+
+ switch (w) {
+ case 4:
+ for (i = 0; i < h / 2; ++i) {
+ const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+ const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+ const int16x8_t d = vec_sub(a0, b0);
+ s = vec_sum4s(d, s);
+ ss = vec_msum(d, d, ss);
+ src_ptr += src_stride * 2;
+ ref_ptr += ref_stride * 2;
+ }
+ break;
+ case 8:
+ for (i = 0; i < h; ++i) {
+ const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
+ const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
+ const int16x8_t d = vec_sub(a0, b0);
+
+ s = vec_sum4s(d, s);
+ ss = vec_msum(d, d, ss);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ break;
+ case 16:
+ for (i = 0; i < h; ++i) {
+ const uint8x16_t va = vec_vsx_ld(0, src_ptr);
+ const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
+ const int16x8_t a0 = unpack_to_s16_h(va);
+ const int16x8_t b0 = unpack_to_s16_h(vb);
+ const int16x8_t a1 = unpack_to_s16_l(va);
+ const int16x8_t b1 = unpack_to_s16_l(vb);
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+
+ s = vec_sum4s(d0, s);
+ ss = vec_msum(d0, d0, ss);
+ s = vec_sum4s(d1, s);
+ ss = vec_msum(d1, d1, ss);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ break;
+ case 32:
+ for (i = 0; i < h; ++i) {
+ variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ break;
+ case 64:
+ for (i = 0; i < h; ++i) {
+ variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+ variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ break;
+ }
+
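+  // Horizontal reduction: vec_sums folds all four 32-bit lanes into lane 3,
+  // vec_splat broadcasts that lane, and vec_ste stores a single element.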
+ s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+ vec_ste(s, 0, sum);
+
+ ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);
+
+ vec_ste((uint32x4_t)ss, 0, sse);
+}
+
+/* Identical to the variance call except that it takes the additional
+ * parameter, sum, and returns that value by reference instead of computing
+ * and returning the variance sse - sum^2 / (w * h).
+ */
+#define GET_VAR(W, H) \
+ void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \
+ }
+
+/* Identical to the variance call except that it does not compute the
+ * sse - sum^2 / (w * h) term: it returns sse as the value and also writes
+ * it through the passed-in pointer.
+ */
+#define MSE(W, H) \
+ uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+#define VAR(W, H) \
+ uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H))); \
+ }
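+
+/* Worked example (illustrative, not from the source): for an 8x8 block with
+ * sum = 64 and sse = 1088, vpx_variance8x8_vsx returns
+ * 1088 - (64 * 64) / 64 = 1024. */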
+
+#define VARIANCES(W, H) VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
new file mode 100644
index 0000000000..2dc66055cc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/vpx_filter.h"
+
+// TODO(lu_zero): unroll
+static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+ vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
+ vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ switch (w) {
+ case 16: {
+ copy_w16(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_w32(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_w64(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int i;
+ for (i = h; i--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
+
+static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ vec_vsx_st(v, 0, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+ vec_vsx_st(v0, 0, dst);
+ vec_vsx_st(v1, 16, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+ const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
+ const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
+ vec_vsx_st(v0, 0, dst);
+ vec_vsx_st(v1, 16, dst);
+ vec_vsx_st(v2, 32, dst);
+ vec_vsx_st(v3, 48, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+ switch (w) {
+ case 16: {
+ avg_w16(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_w32(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_w64(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+
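+// Filters one output pixel: vec_msum forms the 8-tap dot product, vec_sums
+// adds in the bias of 1 << (FILTER_BITS - 1) so that the following right
+// shift is a ROUND_POWER_OF_TWO(sum, FILTER_BITS), and the result is packed
+// with unsigned saturation before a single byte is stored.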
+static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s,
+ const int16x8_t f) {
+ const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
+ const int32x4_t bias =
+ vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
+ const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
+ const uint8x16_t v = vec_splat(
+ vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
+ vec_ste(v, 0, dst);
+}
+
+static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst,
+ const uint8_t *const src_x,
+ const int16_t *const x_filter) {
+ const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
+ const int16x8_t f = vec_vsx_ld(0, x_filter);
+
+ convolve_line(dst, s, f);
+}
+
+// TODO(lu_zero): Implement 8x8 and bigger block special cases
+static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4, int w,
+ int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
+ x_filters[x_q4 & SUBPEL_MASK]);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void convolve_avg_horiz(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ uint8_t v;
+ convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
+ x_filters[x_q4 & SUBPEL_MASK]);
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
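+// Interleaves byte 0 of each of the 8 input rows into the first half of the
+// result (and byte 1 into the second half), i.e. transposes the first two
+// columns of an 8x8 block into rows; the vertical filter below consumes the
+// first half via unpack_to_s16_h.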
+static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c, uint8x16_t d,
+ uint8x16_t e, uint8x16_t f,
+ uint8x16_t g, uint8x16_t h) {
+ uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
+ uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
+ uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
+ uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);
+
+ uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
+ uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);
+
+ return (uint8x16_t)vec_mergeh(abcd, efgh);
+}
+
+static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst,
+ const uint8_t *const src_y,
+ ptrdiff_t src_stride,
+ const int16_t *const y_filter) {
+ uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
+ uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
+ uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
+ uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
+ uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
+ uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
+ uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
+ uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
+ const int16x8_t f = vec_vsx_ld(0, y_filter);
+  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);
+
+  convolve_line(dst, unpack_to_s16_h(s), f);
+}
+
+static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w,
+ int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ convolve_line_v(dst + y * dst_stride,
+ &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+ y_filters[y_q4 & SUBPEL_MASK]);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static VPX_FORCE_INLINE void convolve_avg_vert(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ uint8_t v;
+ convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+ y_filters[y_q4 & SUBPEL_MASK]);
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const filter,
+ int x0_q4, int x_step_q4, int y0_q4,
+ int y_step_q4, int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+ h);
+}
+
+void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+ h);
+}
+
+void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
+ vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/prob.c b/media/libvpx/libvpx/vpx_dsp/prob.c
new file mode 100644
index 0000000000..819e95062e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/prob.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./prob.h"
+
+const uint8_t vpx_norm[256] = {
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const vpx_tree_index *tree,
+ const vpx_prob *pre_probs,
+ const unsigned int *counts,
+ vpx_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count =
+ (l <= 0) ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count =
+ (r <= 0) ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
+ return left_count + right_count;
+}
+
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+ const unsigned int *counts, vpx_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/prob.h b/media/libvpx/libvpx/vpx_dsp/prob.h
new file mode 100644
index 0000000000..7a71c0041f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/prob.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PROB_H_
+#define VPX_VPX_DSP_PROB_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint8_t vpx_prob;
+
+#define MAX_PROB 255
+
+#define vpx_prob_half ((vpx_prob)128)
+
+typedef int8_t vpx_tree_index;
+
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
+#define vpx_complement(x) (255 - (x))
+
+#define MODE_MV_COUNT_SAT 20
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vpx_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vpx_tree_index vpx_tree[];
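+
+/* Example (illustrative; not a table from this file): the 3-leaf tree
+   { -TOKEN_A, 2, -TOKEN_B, -TOKEN_C } reads one bit at node 0 (probs[0]);
+   a 0 bit yields TOKEN_A, a 1 bit moves to node 2, where a second bit
+   (probs[1]) selects between TOKEN_B and TOKEN_C. */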
+
+static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) {
+ assert(den != 0);
+ {
+ const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
+ // (p > 255) ? 255 : (p < 1) ? 1 : p;
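+    // Branchless clamp: when p > 255, (255 - p) is negative and the
+    // arithmetic shift by 23 yields all ones, saturating the low byte to
+    // 255; when p == 0, the final OR raises the result to 1.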
+ const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
+ return (vpx_prob)clipped_prob;
+ }
+}
+
+static INLINE vpx_prob get_binary_prob(unsigned int n0, unsigned int n1) {
+ const unsigned int den = n0 + n1;
+ if (den == 0) return 128u;
+ return get_prob(n0, den);
+}
+
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
+static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
+ return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
+}
+
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+ const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
+ const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat);
+ const unsigned int factor = max_update_factor * count / count_sat;
+ return weighted_prob(pre_prob, prob, factor);
+}
+
+// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
+ const unsigned int ct[2]) {
+ const unsigned int den = ct[0] + ct[1];
+ if (den == 0) {
+ return pre_prob;
+ } else {
+ const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT);
+ const unsigned int factor = count_to_update_factor[count];
+ const vpx_prob prob = get_prob(ct[0], den);
+ return weighted_prob(pre_prob, prob, factor);
+ }
+}
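+
+/* Worked example (illustrative): with pre_prob = 128 and ct = { 20, 10 },
+ * den = 30, count saturates at MODE_MV_COUNT_SAT so factor = 128,
+ * get_prob(20, 30) rounds to 171, and the result is
+ * ROUND_POWER_OF_TWO(128 * 128 + 171 * 128, 8) = 150. */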
+
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+ const unsigned int *counts, vpx_prob *probs);
+
+DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_PROB_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.c b/media/libvpx/libvpx/vpx_dsp/psnr.c
new file mode 100644
index 0000000000..f0d4e927ae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/psnr.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_scale/yv12config.h"
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+ } else {
+ return MAX_PSNR;
+ }
+}
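+
+/* Worked example (illustrative): for an 8-bit 64x64 block, samples = 4096
+ * and peak = 255; an sse equal to samples (a mean squared error of 1.0)
+ * yields 10 * log10(255 * 255) ~= 48.13 dB. */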
+
+/* TODO(yaowu): get_sse() below falls back to the unoptimized encoder_sse()
+ * (and encoder_highbd_8_sse()) for the edge remainders. It should not.
+ */
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
+ int i, j;
+ int64_t sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
+ int i, j;
+ int64_t sse = 0;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ return sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ const int dw = width % 16;
+ const int dh = height % 16;
+ int64_t total_sse = 0;
+ int x, y;
+
+ if (dw > 0) {
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
+ }
+
+ if (dh > 0) {
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ unsigned int sse;
+ for (x = 0; x < width / 16; ++x) {
+ vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+
+ pa += 16;
+ pb += 16;
+ }
+
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+
+ return total_sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+ if (dw > 0) {
+ total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
+ }
+ if (dh > 0) {
+ total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
+ }
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ unsigned int sse;
+ for (x = 0; x < width / 16; ++x) {
+ vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ uint32_t bit_depth, uint32_t in_bit_depth) {
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h, input_shift);
+ } else {
+ sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i],
+ b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr) {
+ static const double peak = 255.0;
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ const uint64_t sse =
+ get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h);
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.h b/media/libvpx/libvpx/vpx_dsp/psnr.h
new file mode 100644
index 0000000000..9ebb64dd52
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/psnr.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PSNR_H_
+#define VPX_VPX_DSP_PSNR_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_encoder.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vpx_psnr_pkt PSNR_STATS;
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in] samples Number of samples
+ * \param[in] peak Max sample value
+ * \param[in] sse Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
+#endif
+void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // VPX_VPX_DSP_PSNR_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c
new file mode 100644
index 0000000000..d7ec1a429a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_dsp/psnr.h"
+
+#if !defined(M_PI)
+#define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ vpx_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ vpx_highbd_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+#endif
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives a slightly higher MOS agreement. */
+static const double csf_y[8][8] = {
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
+static const double csf_cb420[8][8] = {
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
+static const double csf_cr420[8][8] = {
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
+
+static double convert_score_db(double _score, double _weight, int bit_depth) {
+ int16_t pix_max = 255;
+ assert(_score * _weight >= 0.0);
+ if (bit_depth == 10)
+ pix_max = 1023;
+ else if (bit_depth == 12)
+ pix_max = 4095;
+
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
+ return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
+}
+
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t bit_depth, uint32_t _shift) {
+ double ret;
+ const uint8_t *_src8 = src;
+ const uint8_t *_dst8 = dst;
+ const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
+ DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+ DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
+ double mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ (void)_par;
+ ret = pixels = 0;
+
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+ was also constructed from the JPEG matrices. I cannot find any obvious
+ scheme of normalizing to produce their table, but if I multiply their
+ CSF by 0.3885746225901003 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.
+
+ Suggested in aomedia issue #2363:
+ 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509)
+ of the old JPEG based matrix from the paper. Since you are not using that,
+ divide by actual maximum coefficient. */
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]);
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ double s_means[4];
+ double d_means[4];
+ double s_vars[4];
+ double d_vars[4];
+ double s_gmean = 0;
+ double d_gmean = 0;
+ double s_gvar = 0;
+ double d_gvar = 0;
+ double s_mask = 0;
+ double d_mask = 0;
+ for (i = 0; i < 4; i++)
+ s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ if (bit_depth == 8 && _shift == 0) {
+ dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+ } else if (bit_depth == 10 || bit_depth == 12) {
+ dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+ dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+ }
+ s_gmean += dct_s[i * 8 + j];
+ d_gmean += dct_d[i * 8 + j];
+ s_means[sub] += dct_s[i * 8 + j];
+ d_means[sub] += dct_d[i * 8 + j];
+ }
+ }
+ s_gmean /= 64.f;
+ d_gmean /= 64.f;
+ for (i = 0; i < 4; i++) s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++) d_means[i] /= 16.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+ d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
+ (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
+ (dct_d[i * 8 + j] - d_means[sub]);
+ }
+ }
+ s_gvar *= 1 / 63.f * 64;
+ d_gvar *= 1 / 63.f * 64;
+ for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
+ if (s_gvar > 0)
+ s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+ if (d_gvar > 0)
+ d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bit_depth == 10 || bit_depth == 12) {
+ hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+#endif
+ if (bit_depth == 8) {
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 32.f;
+ d_mask = sqrt(d_mask * d_gvar) / 32.f;
+ if (d_mask > s_mask) s_mask = d_mask;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ double err;
+ err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ if (pixels <= 0) return 0;
+ ret /= pixels;
+ return ret;
+}
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs,
+ double *u_psnrhvs, double *v_psnrhvs, uint32_t bd,
+ uint32_t in_bd) {
+ double psnrhvs;
+ const double par = 1.0;
+ const int step = 7;
+ uint32_t bd_shift = 0;
+ vpx_clear_system_state();
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ assert(bd >= in_bd);
+
+ bd_shift = bd - in_bd;
+
+ *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer,
+ dest->y_stride, par, src->y_crop_width,
+ src->y_crop_height, step, csf_y, bd, bd_shift);
+ *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer,
+ dest->uv_stride, par, src->uv_crop_width,
+ src->uv_crop_height, step, csf_cb420, bd, bd_shift);
+ *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer,
+ dest->uv_stride, par, src->uv_crop_width,
+ src->uv_crop_height, step, csf_cr420, bd, bd_shift);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+ return convert_score_db(psnrhvs, 1.0, in_bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.c b/media/libvpx/libvpx/vpx_dsp/quantize.c
new file mode 100644
index 0000000000..7dff8c7a87
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/quantize.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 16;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+ if (tmp) eob = 0;
+
+ *eob_ptr = eob + 1;
+}
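+
+/* Note (illustrative): quant acts as a Q16 reciprocal of the dequantizer
+ * step, so (tmp * quant) >> 16 approximates tmp / dequant. If quant were
+ * 8192 (2^16 / 8, i.e. dequant = 8), a rounded coefficient of 104 would
+ * quantize to (104 * 8192) >> 16 = 13. */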
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const int abs_qcoeff = (int)((tmp * quant) >> 16);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
+ if (abs_qcoeff) eob = 0;
+ }
+
+ *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN,
+ INT16_MAX);
+ tmp = (tmp * quant) >> 15;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
+ if (tmp) eob = 0;
+
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+ const int abs_qcoeff = (int)((tmp * quant) >> 15);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
+ if (abs_qcoeff) eob = 0;
+ }
+
+ *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with index >= non_zero_count are
+  // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ 16; // quantization
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
+
+ if (tmp) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
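+
+/* Note (illustrative): the two multiplies above implement a higher-precision
+ * reciprocal: tmp is first scaled by (1 + quant / 2^16) and then by
+ * quant_shift / 2^16, so the (quant, quant_shift) pair together encodes
+ * division by the quantization step. */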
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with index >= non_zero_count are
+  // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ if (abs_qcoeff) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const int n_coeffs = 32 * 32;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+ ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int16_t *round_ptr = mb_plane->round;
+ const int16_t *quant_ptr = mb_plane->quant;
+ const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+ const int16_t *scan = scan_order->scan;
+
+ int idx = 0;
+ int idx_arr[32 * 32 /* n_coeffs */];
+ int i, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int tmp;
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+ quant_shift_ptr[rc != 0]) >>
+ 15;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+ // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than
+ // truncating with a cast, saturate the value. This is easier to implement
+ // on x86 and preserves the sign of the value.
+ dqcoeff_ptr[rc] =
+ clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif // (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+
+ if (tmp) eob = idx_arr[i];
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ const intptr_t n_coeffs = 32 * 32;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+ ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int16_t *round_ptr = mb_plane->round;
+ const int16_t *quant_ptr = mb_plane->quant;
+ const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+ const int16_t *scan = scan_order->scan;
+
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.h b/media/libvpx/libvpx/vpx_dsp/quantize.h
new file mode 100644
index 0000000000..8e138445e2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/quantize.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_QUANTIZE_H_
+#define VPX_VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant, uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_QUANTIZE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/sad.c b/media/libvpx/libvpx/vpx_dsp/sad.c
new file mode 100644
index 0000000000..619d7aa956
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/sad.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+/* Sum the difference between every corresponding element of the buffers. */
+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ return sad;
+}
+
+#define sadMxN(m, n) \
+ unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ } \
+ unsigned int vpx_sad##m##x##n##_avg_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
+ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+ return sad(src_ptr, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int vpx_sad_skip_##m##x##n##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \
+ (n / 2)); \
+ }
+
+// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
+#define sadMxNx4D(m, n) \
+ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
+ } \
+ void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
+ }
+
+/* clang-format off */
+// 64x64
+sadMxN(64, 64)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNx4D(4, 4)
+/* clang-format on */
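+
+// Usage sketch (hypothetical helper, not part of the library): the skip
+// variants generated above SAD every other row and double the result,
+// approximating the full SAD at roughly half the cost.
+static INLINE void sad_demo(const uint8_t *src, int src_stride,
+                            const uint8_t *ref, int ref_stride,
+                            unsigned int *full, unsigned int *fast) {
+  *full = vpx_sad16x16_c(src, src_stride, ref, ref_stride);
+  // 2x the SAD over the 16x8 block formed by the even rows.
+  *fast = vpx_sad_skip_16x16_c(src, src_stride, ref, ref_stride);
+}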
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,
+                                      const uint8_t *ref8_ptr, int ref_stride,
+                                      int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
+
+ src += src_stride;
+ ref_ptr += ref_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
+
+ src += src_stride;
+ ref_ptr += ref_stride;
+ }
+ return sad;
+}
+
+#define highbd_sadMxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ } \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \
+ vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
+ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \
+ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ } \
+ void vpx_highbd_sad_skip_##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \
+ src, src_stride, ref_array[i], ref_stride); \
+ } \
+ }
+
+/* clang-format off */
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxNx4D(64, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxNx4D(32, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxNx4D(16, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxNx4D(8, 16)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxNx4D(8, 4)
+
+// 4x8
+highbd_sadMxN(4, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 4x4
+highbd_sadMxN(4, 4)
+highbd_sadMxNx4D(4, 4)
+/* clang-format on */
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.c b/media/libvpx/libvpx/vpx_dsp/skin_detection.c
new file mode 100644
index 0000000000..bbbb6c3a17
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+
+#define MODEL_MODE 1
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = { { 7463, 9614 },
+ { 6400, 10240 },
+ { 7040, 10240 },
+ { 8320, 9280 },
+ { 6800, 9614 } };
+static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
+static const int skin_threshold[6] = { 1570636, 1400000, 800000,
+ 800000, 800000, 800000 }; // q18
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int vpx_evaluate_skin_color_difference(const int cb, const int cr,
+ const int idx) {
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+ const int cbcr_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+ const int cr_diff_q12 =
+ (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+ const int skin_diff =
+ skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
+// Checks whether the input YCbCr values correspond to skin color.
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) {
+ if (y < y_low || y > y_high) {
+ return 0;
+ } else if (MODEL_MODE == 0) {
+ return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+ } else {
+ int i = 0;
+ // Exit on grey.
+ if (cb == 128 && cr == 128) return 0;
+ // Exit on very strong cb.
+ if (cb > 150 && cr < 110) return 0;
+ for (; i < 5; ++i) {
+ int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i);
+ if (skin_color_diff < skin_threshold[i + 1]) {
+ if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
+ return 0;
+ } else if (motion == 0 &&
+ skin_color_diff > (skin_threshold[i + 1] >> 1)) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+      // Exit if the difference is much larger than the threshold.
+ if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+ return 0;
+ }
+ }
+ return 0;
+ }
+}
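+
+// Usage sketch (hypothetical values, not part of the library): classify one
+// pixel from its luma/chroma averages. Chroma near the first model centroid
+// (cb ~117, cr ~150) with mid-range luma is accepted; grey is rejected early.
+static int skin_demo(void) {
+  const int y = 120, cb = 117, cr = 150;
+  return vpx_skin_pixel(y, cb, cr, /*motion=*/1);
+}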
diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.h b/media/libvpx/libvpx/vpx_dsp/skin_detection.h
new file mode 100644
index 0000000000..91640c33d5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_
+#define VPX_VPX_DSP_SKIN_DETECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.c b/media/libvpx/libvpx/vpx_dsp/ssim.c
new file mode 100644
index 0000000000..7c3c31bad8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ssim.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static const int64_t cc1 = 26634;        // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708;       // 64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658;    // 64^2*(.01*1023)^2
+static const int64_t cc2_10 = 3857925;   // 64^2*(.03*1023)^2
+static const int64_t cc1_12 = 6868593;   // 64^2*(.01*4095)^2
+static const int64_t cc2_12 = 61817334;  // 64^2*(.03*4095)^2
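+
+// Derivation sketch (illustrative): ssim's stabilizers are c1 = (k1*L)^2 and
+// c2 = (k2*L)^2 with k1 = .01, k2 = .03 and L the dynamic range. The values
+// above are pre-multiplied by 64^2 because the sums in similarity() are
+// unnormalized over the window; (cc * count * count) >> 12 then rescales for
+// the actual pixel count, a no-op for the 8x8 window (count == 64). E.g.
+//   cc1 = round(64^2 * (.01 * 255)^2) = round(26634.24) = 26634.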
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+ uint32_t bd) {
+ double ssim_n, ssim_d;
+ int64_t c1, c2;
+ if (bd == 8) {
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+ } else if (bd == 10) {
+ c1 = (cc1_10 * count * count) >> 12;
+ c2 = (cc2_10 * count * count) >> 12;
+ } else if (bd == 12) {
+ c1 = (cc1_12 * count * count) >> 12;
+ c2 = (cc2_12 * count * count) >> 12;
+ } else {
+ c1 = c2 = 0;
+ assert(0);
+ }
+
+ ssim_n = (2.0 * sum_s * sum_r + c1) *
+ (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
+
+ ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+ ((double)count * sum_sq_s - (double)sum_s * sum_s +
+ (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
+
+ return ssim_n / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// We are using an 8x8 moving window, with each window starting on the 4x4
+// pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries and penalize blocking artifacts.
+static double vpx_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
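+
+// For a width x height frame the loops above evaluate
+// ((width - 8) / 4 + 1) * ((height - 8) / 4 + 1) windows; e.g. a 64x64 frame
+// yields 15 * 15 = 225 overlapping 8x8 windows.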
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height, uint32_t bd, uint32_t shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight) {
+ double a, b, c;
+ double ssimv;
+
+ a = vpx_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width, source->y_crop_height);
+
+ b = vpx_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
+
+ c = vpx_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width, source->uv_crop_height);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+// Traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is highly sensitive on the dark side of the
+// luminance range and almost completely insensitive on the bright side.
+// Compare the two pairs (1,3) and (250,252): the term gives
+// 2*1*3/(1+9) = .60, but 2*250*252/(250^2+252^2) => .99999997.
+//
+// This tweaked version of the calculation therefore takes the luminance as
+// a percentage off from the peak possible value:
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ vpx_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+      // There is no obviously right way to handle the edge pixels in ssim
+      // when using a moving window; the result seems biased against edge
+      // pixels however you handle it. This uses only samples that are
+      // fully inside the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+ // dssim is calculated to use as an actual error metric and
+ // is scaled up to the same range as sum square error.
+ // Since we are subsampling every 16th point maybe this should be
+ // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term3 -> measures inconsistency in ssim scores between frames
+      //    1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation,
+        // 1.0 maps to 4x4x255x255, the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-zero
+        // issues; beyond that they act as a rough weighting between the
+        // terms. No testing of what the right values should be has been
+        // done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+ // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The variance and mean terms encode the assumption that if there
+        // are big changes in the source we should penalize inconsistency in
+        // ssim scores less, as it will be less visible to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
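+
+// Worked sketch (hypothetical numbers): with variance and mean unchanged
+// between frames (variance_term = mean_term = 1.0) and a block's ssim
+// dropping from .99 to .90,
+//   ssim_term = ((2 * .99 * .90 + 1) / (.99^2 + .90^2 + 1))^5 ~= .9856
+// so this_inconsistency ~= (1 - .9856) * 1040400 ~= 1.5e4 is added to the
+// frame total.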
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd) {
+ double a, b, c;
+ double ssimv;
+ uint32_t shift = 0;
+
+ assert(bd >= in_bd);
+ shift = bd - in_bd;
+
+ a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, shift);
+
+ b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, shift);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.h b/media/libvpx/libvpx/vpx_dsp/ssim.h
new file mode 100644
index 0000000000..c382237fc6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ssim.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_SSIM_H_
+#define VPX_VPX_DSP_SSIM_H_
+
+#define MAX_SSIM_DB 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint32_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint32_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint32_t sum_sq_r;
+ uint32_t sum_sq_r;
+
+ // sum of source times reference (over 8x8 region)
+ uint32_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+  // revised ssim (see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight);
+
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_SSIM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/subtract.c b/media/libvpx/libvpx/vpx_dsp/subtract.c
new file mode 100644
index 0000000000..45c819e67a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/subtract.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+
+ diff_ptr += diff_stride;
+ pred_ptr += pred_stride;
+ src_ptr += src_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src8_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff_ptr[c] = src[c] - pred[c];
+ }
+
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/sum_squares.c b/media/libvpx/libvpx/vpx_dsp/sum_squares.c
new file mode 100644
index 0000000000..b80cd588e4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/sum_squares.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) {
+ int r, c;
+ uint64_t ss = 0;
+
+ for (r = 0; r < size; r++) {
+ for (c = 0; c < size; c++) {
+ const int16_t v = src[c];
+ ss += v * v;
+ }
+ src += stride;
+ }
+
+ return ss;
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/txfm_common.h b/media/libvpx/libvpx/vpx_dsp/txfm_common.h
new file mode 100644
index 0000000000..25f4fdb327
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/txfm_common.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_TXFM_COMMON_H_
+#define VPX_VPX_DSP_TXFM_COMMON_H_
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// Constants:
+// for (int i = 1; i < 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const tran_coef_t cospi_1_64 = 16364;
+static const tran_coef_t cospi_2_64 = 16305;
+static const tran_coef_t cospi_3_64 = 16207;
+static const tran_coef_t cospi_4_64 = 16069;
+static const tran_coef_t cospi_5_64 = 15893;
+static const tran_coef_t cospi_6_64 = 15679;
+static const tran_coef_t cospi_7_64 = 15426;
+static const tran_coef_t cospi_8_64 = 15137;
+static const tran_coef_t cospi_9_64 = 14811;
+static const tran_coef_t cospi_10_64 = 14449;
+static const tran_coef_t cospi_11_64 = 14053;
+static const tran_coef_t cospi_12_64 = 13623;
+static const tran_coef_t cospi_13_64 = 13160;
+static const tran_coef_t cospi_14_64 = 12665;
+static const tran_coef_t cospi_15_64 = 12140;
+static const tran_coef_t cospi_16_64 = 11585;
+static const tran_coef_t cospi_17_64 = 11003;
+static const tran_coef_t cospi_18_64 = 10394;
+static const tran_coef_t cospi_19_64 = 9760;
+static const tran_coef_t cospi_20_64 = 9102;
+static const tran_coef_t cospi_21_64 = 8423;
+static const tran_coef_t cospi_22_64 = 7723;
+static const tran_coef_t cospi_23_64 = 7005;
+static const tran_coef_t cospi_24_64 = 6270;
+static const tran_coef_t cospi_25_64 = 5520;
+static const tran_coef_t cospi_26_64 = 4756;
+static const tran_coef_t cospi_27_64 = 3981;
+static const tran_coef_t cospi_28_64 = 3196;
+static const tran_coef_t cospi_29_64 = 2404;
+static const tran_coef_t cospi_30_64 = 1606;
+static const tran_coef_t cospi_31_64 = 804;
+
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const tran_coef_t sinpi_1_9 = 5283;
+static const tran_coef_t sinpi_2_9 = 9929;
+static const tran_coef_t sinpi_3_9 = 13377;
+static const tran_coef_t sinpi_4_9 = 15212;
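+
+// Spot check (illustrative): cospi_16_64 = round(16384 * cos(16 * Pi / 64))
+// = round(16384 * sqrt(2) / 2) = 11585, and
+// sinpi_3_9 = 16384 * sqrt(2) * sin(3 * Pi / 9) * 2 / 3 ~= 13377.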
+
+#endif // VPX_VPX_DSP_TXFM_COMMON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/variance.c b/media/libvpx/libvpx/vpx_dsp/variance.c
new file mode 100644
index 0000000000..ce1e8382b9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/variance.c
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
+ int diff = src_ptr[c] - ref_ptr[c];
+ distortion += diff * diff;
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return distortion;
+}
+
+uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; ++i) {
+ sum += src_ptr[i] * src_ptr[i];
+ }
+
+ return sum;
+}
+
+static void variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the first-pass of 2-D separable filter.
+//
+// Produces 16-bit output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+static void var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
+ int pixel_step, unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ ref_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ ref_ptr += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the second-pass of 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ ref_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ ref_ptr += output_width;
+ }
+}
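+
+// Worked example (illustrative): x_offset = 4 selects the kernel {64, 64},
+// an exact half-pel average, since the taps sum to FILTER_WEIGHT (128) and
+// FILTER_BITS is 7. For neighboring pixels 10 and 20:
+//   ROUND_POWER_OF_TWO(10 * 64 + 20 * 64, 7) = (1920 + 64) >> 7 = 15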
+
+#define VAR(W, H) \
+ uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
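+
+// Worked example of the formula above (illustrative): variance removes the
+// DC component of the differences. For the 2x2 diff block {2, -2, 2, -2}:
+//   sse = 16, sum = 0  ->  variance = 16 - 0 * 0 / 4 = 16
+// while for the constant diffs {2, 2, 2, 2}:
+//   sse = 16, sum = 8  ->  variance = 16 - 8 * 8 / 4 = 0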
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t vpx_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[x_offset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \
+ }
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[x_offset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
+ \
+ return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \
+ }
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+ void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \
+ }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
+ * variable.
+ */
+#define MSE(W, H) \
+ uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride, int w,
+ int h, uint64_t *sse, int64_t *sum) {
+ int i, j;
+
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
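+// The 10- and 12-bit helpers below renormalize the accumulators to an 8-bit
+// scale: pixel values are 4x (10-bit) or 16x (12-bit) larger, so sums shrink
+// by 2 or 4 bits and squared sums by 4 or 8 bits. Illustrative check: a
+// constant diff of 4 over 16 pixels at 10 bits gives sum 64 and sse 256,
+// which normalize to 16 and 16 -- the same as a diff of 1 at 8 bits.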
+static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t vpx_highbd_8_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t vpx_highbd_8_mse##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ }
+
+static void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \
+ temp2, W); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \
+ temp2, W); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \
+ temp2, W); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/variance.h b/media/libvpx/libvpx/vpx_dsp/variance.h
new file mode 100644
index 0000000000..ccdb2f90ba
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/variance.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_VARIANCE_H_
+#define VPX_VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride);
+
+typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred);
+
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride,
+ uint8_t *ref_ptr, int ref_stride, int n);
+
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sad_array);
+
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const b_array[],
+ int ref_stride, unsigned int *sad_array);
+
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, unsigned int *sse);
+
+typedef unsigned int (*vpx_subpixvariance_fn_t)(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred);
+
+#if CONFIG_VP8
+typedef struct variance_vtable {
+ vpx_sad_fn_t sdf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_sad_multi_d_fn_t sdx4df;
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+ vp8_copy32xn_fn_t copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+#endif // CONFIG_VP8
+
+#if CONFIG_VP9
+typedef struct vp9_variance_vtable {
+ vpx_sad_fn_t sdf;
+ // Same as normal sad, but downsample the rows by a factor of 2.
+ vpx_sad_fn_t sdsf;
+ vpx_sad_avg_fn_t sdaf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_subp_avg_variance_fn_t svaf;
+ vpx_sad_multi_d_fn_t sdx4df;
+ // Same as sadx4, but downsample the rows by a factor of 2.
+ vpx_sad_multi_d_fn_t sdsx4df;
+} vp9_variance_fn_ptr_t;
+#endif // CONFIG_VP9
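+
+// Hypothetical wiring sketch (names are rtcd dispatch symbols, not defined
+// in this header): an encoder fills one vtable per block size with matching
+// kernels, e.g. for 16x16:
+//   vp9_variance_fn_ptr_t fn;
+//   fn.sdf = vpx_sad16x16;
+//   fn.sdsf = vpx_sad_skip_16x16;
+//   fn.vf = vpx_variance16x16;
+//   fn.svf = vpx_sub_pixel_variance16x16;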
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_VARIANCE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c
new file mode 100644
index 0000000000..e55a963f9d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
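+
+// The q4 accumulator tracks the sub-pixel position in 1/16-pel steps:
+// x_q4 >> SUBPEL_BITS selects the starting source pixel and
+// x_q4 & SUBPEL_MASK selects one of the 16 eight-tap kernels. Illustrative
+// case: x0_q4 = 8 with x_step_q4 = 16 (unscaled half-pel) applies kernel 8
+// at every output, advancing the source by exactly one pixel per output.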
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+ 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
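The _avg variants blend the newly filtered pixel with whatever is already in dst using ROUND_POWER_OF_TWO(a + b, 1), i.e. (a + b + 1) >> 1, so exact halves round up. A quick stand-alone illustration; the macro is re-defined locally only for this snippet and is assumed to match the library's rounding convention:

    #include <assert.h>
    #include <stdint.h>

    /* Local stand-in for the library macro, for illustration only. */
    #define LOCAL_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      const uint8_t dst = 10, filtered = 13;
      /* (10 + 13 + 1) >> 1 = 12: the 11.5 midpoint rounds up. */
      assert(LOCAL_ROUND_POWER_OF_TWO(dst + filtered, 1) == 12);
      /* Exact averages are unchanged: (10 + 12 + 1) >> 1 = 11. */
      assert(LOCAL_ROUND_POWER_OF_TWO(10 + 12, 1) == 11);
      return 0;
    }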
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+ convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+ h);
+}
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+ convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+ h);
+}
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ (void)x0_q4;
+ (void)x_step_q4;
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // When called from the frame scaling function, the smallest scaling factor
+ // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 there, the temp
+ // buffer is still big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
+}
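To see that the 64 * 135 temp buffer covers the worst case the asserts allow, here is a small stand-alone check of the intermediate_height formula. SUBPEL_BITS = 4 and SUBPEL_TAPS = 8 are restated locally to mirror vpx_filter.h, and y0_q4 is assumed to stay below 16 (within one pixel):

    #include <assert.h>

    #define SUBPEL_BITS 4 /* mirrors vpx_dsp/vpx_filter.h */
    #define SUBPEL_TAPS 8

    int main(void) {
      /* Worst case permitted by the asserts in vpx_convolve8_c:
       * h = 64 rows, y_step_q4 = 32 (x1/2 scaling), y0_q4 just below a pixel. */
      const int h = 64, y_step_q4 = 32, y0_q4 = 15;
      const int intermediate_height =
          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
      assert(intermediate_height <= 135); /* fits the 135 rows of temp */
      /* The scaling-path case (h <= 32, y_step_q4 <= 64) is smaller still:
       * (((32 - 1) * 64 + 15) >> 4) + 8 = 132. */
      return 0;
    }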
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
+ vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ int r;
+
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ int x, y;
+
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4, int x_step_q4,
+ int y0_q4, int y_step_q4, int w, int h) {
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint16_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height, bd);
+ highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ filter, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ (void)x0_q4;
+ (void)x_step_q4;
+
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)x0_q4;
+ (void)x_step_q4;
+
+ highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h, bd);
+ vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,
+ bd);
+}
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ int r;
+
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w * sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ int x, y;
+
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h
new file mode 100644
index 0000000000..d5793e17ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_VPX_DSP_VPX_CONVOLVE_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_VPX_CONVOLVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 0000000000..4368b77f38
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,471 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-yes += vpx_dsp_common.h
+
+DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+
+DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, this would create an
+# executable section on the stack.
+#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM)
+
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif
+
+# intra predictions
+DSP_SRCS-yes += intrapred.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
+endif # CONFIG_VP9_HIGHBITDEPTH
+
+ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += add_noise.c
+DSP_SRCS-yes += deblock.c
+DSP_SRCS-yes += postproc.h
+DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
+DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c
+DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c
+DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c
+endif # CONFIG_POSTPROC
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
+
+DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
+
+DSP_SRCS-yes += vpx_filter.h
+ifeq ($(CONFIG_VP9),yes)
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+
+DSP_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += x86/convolve.h
+
+DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h
+DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h
+DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_4t_intrin_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
+endif
+
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c
+
+# common (dspr2)
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c
+
+DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c
+
+# common (lsx)
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
+else
+DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
+endif # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c
+
+DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h
+DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_4_lsx.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c
+endif # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_VP9
+
+DSP_SRCS-yes += txfm_common.h
+DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
+DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
+DSP_SRCS-$(HAVE_LSX) += loongarch/txfm_macros_lsx.h
+# forward transform
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes += fwd_txfm.c
+DSP_SRCS-yes += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(VPX_ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
+endif
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h
+DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c
+endif # !CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c
+endif # CONFIG_VP9_ENCODER
+
+# inverse transform
+ifeq ($(CONFIG_VP9),yes)
+DSP_SRCS-yes += inv_txfm.h
+DSP_SRCS-yes += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
+
+DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c
+
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+
+DSP_SRCS-$(HAVE_LSX) += loongarch/idct32x32_lsx.c
+else # CONFIG_VP9_HIGHBITDEPTH
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c
+endif # !CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
+else
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
+endif # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
+
+endif # CONFIG_VP9
+
+# quantization
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes += quantize.c
+DSP_SRCS-yes += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h
+DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
+DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_intrin_lsx.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c
+endif
+
+# avg
+DSP_SRCS-yes += avg.c
+DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c
+endif
+DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c
+ifeq ($(VPX_ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
+
+endif # CONFIG_VP9_ENCODER
+
+# skin detection
+DSP_SRCS-yes += skin_detection.h
+DSP_SRCS-yes += skin_detection.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += sad.c
+DSP_SRCS-yes += subtract.c
+DSP_SRCS-yes += sum_squares.c
+DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
+DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
+
+DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
+
+DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
+
+DSP_SRCS-$(HAVE_LSX) += loongarch/sad_lsx.c
+
+DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c
+DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
+
+DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c
+DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
+
+DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
+
+DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
+endif # CONFIG_VP9_HIGHBITDEPTH
+
+endif # CONFIG_ENCODERS
+
+ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += variance.c
+DSP_SRCS-yes += variance.h
+
+DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
+
+DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.h
+DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c
+
+DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c
+
+ifeq ($(VPX_ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm
+endif # VPX_ARCH_X86_64
+
+DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
+endif # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
+# Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h
+
+# PPC VSX utilities
+DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/txfm_common_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
+
+# X86 utilities
+DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h
new file mode 100644
index 0000000000..2de4495465
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_
+#define VPX_VPX_DSP_VPX_DSP_COMMON_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define VPX_SWAP(type, a, b) \
+ do { \
+ type c = (b); \
+ (b) = a; \
+ (a) = c; \
+ } while (0)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+typedef int16_t tran_coef_t;
+
+static INLINE uint8_t clip_pixel(int val) {
+ return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+ switch (bd) {
+ case 8:
+ default: return (uint16_t)clamp(val, 0, 255);
+ case 10: return (uint16_t)clamp(val, 0, 1023);
+ case 12: return (uint16_t)clamp(val, 0, 4095);
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_
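A quick sanity check of the clamping helpers declared above. The functions are copied locally so the snippet builds outside the tree; they mirror the header purely for illustration.

    #include <assert.h>
    #include <stdint.h>

    /* Local copies mirroring vpx_dsp_common.h, for illustration only. */
    static int local_clamp(int value, int low, int high) {
      return value < low ? low : (value > high ? high : value);
    }
    static uint16_t local_clip_pixel_highbd(int val, int bd) {
      switch (bd) {
        case 8:
        default: return (uint16_t)local_clamp(val, 0, 255);
        case 10: return (uint16_t)local_clamp(val, 0, 1023);
        case 12: return (uint16_t)local_clamp(val, 0, 4095);
      }
    }

    int main(void) {
      assert(local_clip_pixel_highbd(300, 8) == 255);    /* 8-bit range 0..255 */
      assert(local_clip_pixel_highbd(1100, 10) == 1023); /* 10-bit range 0..1023 */
      assert(local_clip_pixel_highbd(-7, 12) == 0);      /* negatives clamp to 0 */
      assert(local_clip_pixel_highbd(4095, 12) == 4095); /* in range, unchanged */
      return 0;
    }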
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 0000000000..030c456d39
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
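vpx_dsp_rtcd() runs the generated setup_rtcd_internal() exactly once; that routine, produced from vpx_dsp_rtcd_defs.pl below, points each dispatchable symbol at the best implementation for the running CPU. The following is a self-contained sketch of that idea only; run_once, setup_dispatch, sad_c and sad_simd are hypothetical stand-ins, not the generated code.

    #include <stdio.h>

    /* Hypothetical stand-ins for a dispatched kernel and its variants. */
    static unsigned int sad_c(const unsigned char *a, const unsigned char *b) {
      unsigned int sum = 0;
      int i;
      for (i = 0; i < 16; ++i)
        sum += (unsigned int)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
      return sum;
    }
    static unsigned int sad_simd(const unsigned char *a, const unsigned char *b) {
      return sad_c(a, b); /* pretend this were a SIMD build of the same kernel */
    }

    /* The dispatch slot: in libvpx these are the RTCD_EXTERN pointers. */
    static unsigned int (*sad)(const unsigned char *, const unsigned char *);

    static void setup_dispatch(void) {
      const int have_simd = 1; /* libvpx derives this from CPU feature detection */
      sad = have_simd ? sad_simd : sad_c;
    }

    /* Minimal, non-thread-safe stand-in for once() from vpx_ports/vpx_once.h. */
    static void run_once(void (*func)(void)) {
      static int done;
      if (!done) {
        done = 1;
        func();
      }
    }

    int main(void) {
      unsigned char a[16] = { 0 }, b[16] = { 3 };
      run_once(setup_dispatch);
      printf("sad = %u\n", sad(a, b));
      return 0;
    }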
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 0000000000..cae4ca8116
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,1823 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+sub vpx_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+EOF
+}
+forward_decls qw/vpx_dsp_forward_decls/;
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+ $avx512_x86_64 = 'avx512';
+}
+
+#
+# Intra prediction
+#
+
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_4x4 neon sse2/;
+
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_4x4 neon sse2/;
+
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_4x4 neon ssse3/;
+
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/;
+
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4 neon/;
+
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_4x4 neon/;
+
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_4x4 neon ssse3/;
+
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
+
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/;
+
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
+
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
+
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
+
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
+
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_8x8 neon ssse3/;
+
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_d45_predictor_8x8 neon sse2/;
+
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_d63_predictor_8x8 neon ssse3/;
+
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
+
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8 neon/;
+
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_8x8 neon/;
+
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_8x8 neon ssse3/;
+
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
+
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/;
+
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/;
+
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
+
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
+
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
+
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_16x16 neon ssse3/;
+
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
+
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/;
+
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
+
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16 neon/;
+
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_16x16 neon/;
+
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_16x16 neon ssse3/;
+
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/;
+
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_32x32 neon ssse3/;
+
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
+
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/;
+
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32 neon/;
+
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_32x32 neon/;
+
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_32x32 neon ssse3/;
+
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
+
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
+
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
+
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
+
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/;
+
+ add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
+
+ add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
+} # CONFIG_VP9_HIGHBITDEPTH
+
+if (vpx_config("CONFIG_VP9") eq "yes") {
+#
+# Sub Pixel Filters
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/;
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
+
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_2d ssse3 neon msa/;
+
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+} #CONFIG_VP9
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
+
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
+
+ add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+ specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
+} # CONFIG_VP9_HIGHBITDEPTH
+
+if (vpx_config("CONFIG_VP9") eq "yes") {
+#
+# Loopfilter
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/;
+} #CONFIG_VP9
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_16 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_16_dual sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_8 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_8_dual sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_4 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vpx_highbd_lpf_vertical_4_dual sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_16 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_16_dual sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_8 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_4 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2 neon/;
+} # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct4x4 neon sse2/;
+
+ add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct4x4_1 sse2 neon/;
+ specialize qw/vpx_highbd_fdct4x4_1 neon/;
+ $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
+
+ add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct8x8 neon sse2/;
+
+ add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct8x8_1 neon sse2 msa/;
+
+ add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct16x16 neon sse2/;
+
+ add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct16x16_1 sse2 neon/;
+
+ add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32 neon sse2/;
+
+ add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32_rd neon sse2/;
+
+ add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32_1 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct8x8_1 neon/;
+ $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
+
+ add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct16x16_1 neon/;
+
+ add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
+
+ add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32_1 neon/;
+} else {
+ add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
+
+ add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct4x4_1 sse2 neon/;
+
+ add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
+
+ add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/;
+
+ add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct16x16_1 sse2 neon msa/;
+
+ add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/;
+
+ add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/;
+
+ add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_VP9_ENCODER
+
+#
+# Inverse transform
+if (vpx_config("CONFIG_VP9") eq "yes") {
+
+add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ # Note that there are more specializations appended when
+ # CONFIG_VP9_HIGHBITDEPTH is off.
+ specialize qw/vpx_idct4x4_16_add neon sse2 vsx/;
+ specialize qw/vpx_idct4x4_1_add neon sse2/;
+ specialize qw/vpx_idct8x8_64_add neon sse2 vsx/;
+ specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
+ specialize qw/vpx_idct8x8_1_add neon sse2/;
+ specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/;
+ specialize qw/vpx_idct16x16_38_add neon sse2/;
+ specialize qw/vpx_idct16x16_10_add neon sse2/;
+ specialize qw/vpx_idct16x16_1_add neon sse2/;
+ specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/;
+ specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/;
+ specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
+ specialize qw/vpx_idct32x32_1_add neon sse2/;
+ specialize qw/vpx_iwht4x4_16_add sse2 vsx/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+ # Note that these specializations are appended to the above ones.
+ specialize qw/vpx_idct4x4_16_add dspr2 msa/;
+ specialize qw/vpx_idct4x4_1_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_64_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_12_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_1_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_256_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_38_add dspr2 msa/;
+ $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2;
+ $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
+ specialize qw/vpx_idct16x16_10_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_1_add dspr2 msa/;
+ specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/;
+ specialize qw/vpx_idct32x32_135_add dspr2 msa/;
+ $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+ $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
+ $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx;
+ specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/;
+ specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/;
+ specialize qw/vpx_iwht4x4_16_add msa/;
+ specialize qw/vpx_iwht4x4_1_add msa/;
+ } # !CONFIG_VP9_HIGHBITDEPTH
+} # !CONFIG_EMULATE_HARDWARE
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ # Note as optimized versions of these functions are added we need to add a check to ensure
+ # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
+
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;
+
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;
+
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;
+
+ add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
+
+ add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_1024_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_135_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct32x32_34_add neon sse2 sse4_1/;
+ } # !CONFIG_EMULATE_HARDWARE
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_VP9
+
+#
+# Quantization
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+ add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
+
+ add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order";
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order";
+ specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
+ } # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_VP9_ENCODER
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
+
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x64 neon avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x32 neon avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x64 neon avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x32 neon avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x16 neon avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x32 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x16 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x16 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x4 neon/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x4 neon/;
+
+#
+# Avg
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+ add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/vpx_avg_8x8 sse2 neon msa/;
+
+ add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/vpx_avg_4x4 sse2 neon msa/;
+
+ add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/vpx_minmax_8x8 sse2 neon msa/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
+
+ add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
+
+ add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/;
+
+ add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/;
+
+ add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/;
+
+ add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_satd avx2 sse2 neon/;
+
+ add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_highbd_satd avx2 neon/;
+ } else {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
+
+ add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
+
+ add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+ specialize qw/vpx_satd avx2 sse2 neon msa/;
+ }
+
+ add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
+ specialize qw/vpx_int_pro_row sse2 neon msa/;
+
+ add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
+ specialize qw/vpx_int_pro_col sse2 neon msa/;
+
+ add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+ specialize qw/vpx_vector_var neon sse2 msa/;
+} # CONFIG_VP9_ENCODER
+
+add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/;
+
+add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/;
+
+add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
+
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
+
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/;
+
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
+
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
+
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/;
+
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
+
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
+
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
+
+add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x32x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x16x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x16x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x4x4d neon/;
+
+add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x4x4d neon/;
+
+add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
+specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
+
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Block subtraction
+ #
+ add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block neon avx2/;
+
+ #
+ # Single block SAD
+ #
+ add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x4 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_8x4 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_4x8 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad_skip_4x4 neon/;
+
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
+ specialize qw/vpx_highbd_avg_8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
+ specialize qw/vpx_highbd_avg_4x4 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
+ specialize qw/vpx_highbd_minmax_8x8 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg neon/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg neon/;
+
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
+
+ add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
+
+ add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
+
+ add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
+
+ add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
+
+ add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
+
+ add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_8x4x4d neon/;
+
+ add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/;
+
+ add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+ specialize qw/vpx_highbd_sad_skip_4x4x4d neon/;
+
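(Editorial aside, not patch content: an `x4d` kernel scores one source block against four reference candidates in a single call, writing the four sums to `sad_array`; the `skip` variants sample every other row and double the partial sum, trading accuracy for speed. A minimal scalar sketch of that contract, assuming the usual libvpx convention that these high-bit-depth `uint8_t *` arguments alias `uint16_t` samples; the row-skipping/doubling behaviour is modelled on the libvpx C fallbacks, not on this prototype list itself.)

```c
#include <stdint.h>
#include <stdlib.h>

static uint32_t highbd_sad(const uint16_t *src, int src_stride,
                           const uint16_t *ref, int ref_stride, int w, int h,
                           int row_step) {
  uint32_t sad = 0;
  int r, c;
  for (r = 0; r < h; r += row_step) {
    for (c = 0; c < w; ++c) sad += (uint32_t)abs(src[c] - ref[c]);
    src += src_stride * row_step;
    ref += ref_stride * row_step;
  }
  return sad;
}

/* One source block vs. four independent reference blocks. */
static void highbd_sad_x4d(const uint16_t *src, int src_stride,
                           const uint16_t *const ref[4], int ref_stride,
                           uint32_t sad_array[4], int w, int h, int skip) {
  int i;
  for (i = 0; i < 4; ++i) {
    sad_array[i] =
        highbd_sad(src, src_stride, ref[i], ref_stride, w, h, skip ? 2 : 1);
    if (skip) sad_array[i] <<= 1; /* compensate for the skipped rows */
  }
}
```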
+ #
+ # Structured Similarity (SSIM)
+ #
+ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ }
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_ENCODERS
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+
+#
+# Variance
+#
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/;
+
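(Editorial aside: each `vpx_varianceWxH` kernel above returns the block variance and also reports the raw sum of squared errors through the `sse` out-parameter; the two are related by `variance = sse - sum^2 / (w*h)`. A scalar sketch, with the `int64_t` widening of `sum` carried over from the libvpx C reference as an assumption:)

```c
#include <stdint.h>

/* variance = SSE - sum^2 / N over a w x h block of 8-bit samples. */
static uint32_t variance(const uint8_t *src, int src_stride,
                         const uint8_t *ref, int ref_stride, int w, int h,
                         uint32_t *sse) {
  int64_t sum = 0;
  uint32_t sse_acc = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = src[c] - ref[c];
      sum += d;
      sse_acc += (uint32_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sse_acc;
  return sse_acc - (uint32_t)((sum * sum) / (w * h));
}
```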
+#
+# Specialty Variance
+#
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/;
+
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_get8x8var sse2 neon msa vsx/;
+
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/;
+
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/;
+
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+ specialize qw/vpx_get_mb_ss sse2 msa vsx/;
+
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride";
+ specialize qw/vpx_get4x4sse_cs neon msa vsx/;
+
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+ specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
+
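(Editorial aside: `vpx_comp_avg_pred` above builds the compound prediction consumed by the `_avg` SAD and variance kernels: a rounded per-pixel average of `pred` and `ref`, written to a packed buffer whose stride equals `width`. A sketch of the expected behaviour, where the `(a + b + 1) >> 1` rounding mirrors libvpx's `ROUND_POWER_OF_TWO` and is stated here as an assumption:)

```c
#include <stdint.h>

static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1); /* round up */
    }
    comp_pred += width; /* comp_pred and pred are packed (stride == width) */
    pred += width;
    ref += ref_stride;
  }
}
```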
+#
+# Subpixel Variance
+#
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
+
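(Editorial aside: the sub-pixel kernels above interpolate the source with a two-tap bilinear filter before the variance step; `x_offset` and `y_offset` select one of eight eighth-pel phases, and the `_avg_` variants that follow additionally average the interpolated block with `second_pred` first. A condensed sketch of the first, horizontal pass, with the 128-weight filter table and rounding modelled on the libvpx C reference rather than taken from this file:)

```c
#include <stdint.h>

/* Eighth-pel bilinear weights; each pair sums to 128 (1 << 7). */
static const uint8_t kBilinear[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 },
                                         { 80, 48 }, { 64, 64 }, { 48, 80 },
                                         { 32, 96 }, { 16, 112 } };

/* Horizontal pass into a (h + 1)-row temp buffer; a second, vertical pass
 * over tmp with kBilinear[y_offset] completes the interpolation, after
 * which the plain variance computation runs against ref_ptr. */
static void subpel_first_pass(const uint8_t *src, int src_stride, uint8_t *tmp,
                              int w, int h, int x_offset) {
  const uint8_t *f = kBilinear[x_offset];
  int r, c;
  for (r = 0; r < h + 1; ++r) { /* one extra row feeds the vertical pass */
    for (c = 0; c < w; ++c)
      tmp[c] = (uint8_t)((src[c] * f[0] + src[c + 1] * f[1] + 64) >> 7);
    src += src_stride;
    tmp += w;
  }
}
```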
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x4 neon/;
+ add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x4 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x4 neon/;
+ add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x4 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x4 neon/;
+ add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x4 neon/;
+
+ add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
+
+ add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
+
+ add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
+
+ add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
+
+ add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
+
+ add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x16 neon/;
+ add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x16 neon/;
+ add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
+
+ add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x8 neon/;
+ add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x16 neon/;
+ add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
+
+ add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+ specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
+
+ #
+ # Subpixel Variance
+ #
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
+
+} # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Post Processing
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+ add_proto qw/void vpx_plane_add_noise/, "uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch";
+ specialize qw/vpx_plane_add_noise sse2 msa/;
+
+ add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols, int flimit";
+ specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
+
+ add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols, int flimit";
+ specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
+
+ add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
+
+}
+
+} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
+1;
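(Editorial note on the Perl module that ends here: each `add_proto` line declares a function-pointer slot in the generated `vpx_dsp_rtcd.h`, and `specialize` lists the SIMD flavours the run-time CPU check may wire into it. Callers never name an architecture; a small hedged illustration, with the wrapper function ours rather than libvpx's:)

```c
#include "./vpx_dsp_rtcd.h"

/* vpx_variance16x16 resolves at rtcd setup time to the best available
 * specialization declared above (sse2/avx2/neon/...), else the C fallback. */
static unsigned int block_variance16x16(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return vpx_variance16x16(src, src_stride, ref, ref_stride, &sse);
}
```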
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
new file mode 100644
index 0000000000..54357ee6ca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_VPX_FILTER_H_
+#define VPX_VPX_DSP_VPX_FILTER_H_
+
+#include <assert.h>
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
+ assert(filter[3] != 128);
+ if (!filter[0] && !filter[1] && !filter[2])
+ return 2;
+ else
+ return 8;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_VPX_FILTER_H_
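(Editorial aside, a usage illustration for the header above: `vpx_get_filter_taps` lets convolution code take a cheaper 2-tap path when only the two centre taps are non-zero. The kernel values below are hypothetical, chosen only to satisfy the `filter[3] != 128` assertion, and compiling this presumes a libvpx build context where the `INLINE` macro is defined.)

```c
#include <stdio.h>
#include "vpx_dsp/vpx_filter.h"

int main(void) {
  /* Only taps 3 and 4 are non-zero, so this acts as a 2-tap filter. */
  static const InterpKernel bilinear = { 0, 0, 0, 112, 16, 0, 0, 0 };
  /* A full 8-tap kernel; taps sum to 128 (1 << FILTER_BITS). */
  static const InterpKernel eighttap = { -1, 3, -10, 122, 18, -7, 3, 0 };

  printf("bilinear: %d taps\n", vpx_get_filter_taps(bilinear)); /* 2 */
  printf("eighttap: %d taps\n", vpx_get_filter_taps(eighttap)); /* 8 */
  return 0;
}
```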
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
new file mode 100644
index 0000000000..f51718cf99
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise,
+; int blackclamp, int whiteclamp,
+; int width, int height, int pitch)
+globalsym(vpx_plane_add_noise_sse2)
+sym(vpx_plane_add_noise_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+
+ mov rdx, 0x01010101
+ mov rax, arg(2)
+ mul rdx
+ movq xmm3, rax
+ pshufd xmm3, xmm3, 0 ; xmm3 = 16 copies of the blackclamp byte
+
+ mov rdx, 0x01010101
+ mov rax, arg(3)
+ mul rdx
+ movq xmm4, rax
+ pshufd xmm4, xmm4, 0 ; xmm4 = 16 copies of the whiteclamp byte
+
+ movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp
+ paddusb xmm5, xmm4
+
+.addnoise_loop:
+ call sym(LIBVPX_RAND) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(4) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax, rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, xmm3 ; subtract black clamp
+ paddusb xmm1, xmm5 ; add both clamp
+ psubusb xmm1, xmm4 ; subtract whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(6) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(5), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+rd42:
+ times 8 dw 0x04
+four8s:
+ times 4 dd 8
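(Editorial aside for readers tracing the SSE2 above: the `psubusb` / `paddusb` / `psubusb` sequence is a branch-free clamp of each pixel into `[blackclamp, 255 - whiteclamp]`, which guarantees the final `paddb` of the signed noise byte cannot wrap. A scalar sketch of the same arithmetic; the helper names are ours:)

```c
#include <stdint.h>

static uint8_t sat_add_u8(uint8_t a, uint8_t b) {
  const int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);
}

static uint8_t sat_sub_u8(uint8_t a, uint8_t b) {
  const int v = a - b;
  return (uint8_t)(v < 0 ? 0 : v);
}

/* Mirrors the xmm3 (black), xmm5 (black + white) and xmm4 (white) steps. */
static uint8_t add_noise_pixel(uint8_t p, uint8_t blackclamp,
                               uint8_t whiteclamp, int8_t noise) {
  const uint8_t both = sat_add_u8(blackclamp, whiteclamp); /* paddusb */
  p = sat_sub_u8(p, blackclamp);                           /* psubusb */
  p = sat_add_u8(p, both);                                 /* paddusb */
  p = sat_sub_u8(p, whiteclamp);                           /* psubusb */
  return (uint8_t)(p + noise);                             /* paddb   */
}
```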
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..b2e01319d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
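(Editorial aside: each call to `highbd_hadamard_col8_avx2` applies an unnormalized 8-point Hadamard butterfly to every 32-bit lane; the `iter == 0` pass additionally transposes the 8x8 tile via the unpack/permute block so the second pass works on columns. The scalar analogue of one lane, using the same output permutation as the `iter == 1` branch:)

```c
#include <stdint.h>

static void hadamard8(const int32_t a[8], int32_t out[8]) {
  const int32_t b0 = a[0] + a[1], b1 = a[0] - a[1];
  const int32_t b2 = a[2] + a[3], b3 = a[2] - a[3];
  const int32_t b4 = a[4] + a[5], b5 = a[4] - a[5];
  const int32_t b6 = a[6] + a[7], b7 = a[6] - a[7];
  const int32_t c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
  const int32_t c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;
  out[0] = c0 + c4; out[7] = c1 + c5; out[3] = c2 + c6; out[4] = c3 + c7;
  out[2] = c0 - c4; out[6] = c1 - c5; out[1] = c2 - c6; out[5] = c3 - c7;
}
```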
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
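vpx_highbd_hadamard_16x16_avx2 builds the larger transform recursively: four 8x8 Hadamard blocks are computed first, then one more butterfly pass combines the same coefficient position from each sub-block, halving the intermediate sums so the growing dynamic range stays bounded (the 32x32 version below repeats this with a shift of 2). A scalar sketch of that combine step, assuming t holds the four 8x8 outputs back to back, 64 coefficients each:

#include <stdint.h>

static void combine_16x16_ref(const int32_t *t, int32_t *out) {
  int i;
  for (i = 0; i < 64; ++i) {
    // Butterfly across the four 8x8 sub-blocks, with >> 1 normalization.
    const int32_t b0 = (t[i] + t[i + 64]) >> 1;
    const int32_t b1 = (t[i] - t[i + 64]) >> 1;
    const int32_t b2 = (t[i + 128] + t[i + 192]) >> 1;
    const int32_t b3 = (t[i + 128] - t[i + 192]) >> 1;
    out[i] = b0 + b2;
    out[i + 64] = b1 + b3;
    out[i + 128] = b0 - b2;
    out[i + 192] = b1 - b3;
  }
}
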
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
+static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ __m256i src[8];
+ src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+ src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
+
+ hadamard_col8x2_avx2(src, 0);
+ hadamard_col8x2_avx2(src, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+ }
+
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ if (is_final) {
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+ coeff += 16;
+ } else {
+ _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+ coeff16 += 16;
+ }
+ t_coeff += 16;
+ }
+}
+
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_avx2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 2);
+ b1 = _mm256_srai_epi16(b1, 2);
+ b2 = _mm256_srai_epi16(b2, 2);
+ b3 = _mm256_srai_epi16(b3, 2);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 16) {
+ const __m256i src_line = load_tran_low(coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
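vpx_satd_avx2 sums the absolute values of the transform coefficients, using madd against a vector of ones to widen the 16-bit absolute values into 32-bit partial sums before the horizontal reduction. A scalar reference for the same contract (assuming tran_low_t from vpx_dsp/vpx_dsp_common.h):

#include <stdlib.h>

#include "vpx_dsp/vpx_dsp_common.h"

// Sum of absolute transformed differences over `length` coefficients.
static int satd_ref(const tran_low_t *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i) sum += abs((int)coeff[i]);
  return sum;
}
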
+#if CONFIG_VP9_HIGHBITDEPTH
+int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 8, coeff += 8) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi32(src_line);
+ accum = _mm256_add_epi32(accum, abs);
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..015c11a1f3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_ports/mem.h"
+
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 32) >> 6;
+}
+
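vpx_avg_8x8_sse2 widens each row of pixels to 16 bits, accumulates the eight rows with saturating adds, reduces the lane sums horizontally, and returns the rounded mean: (sum + 32) >> 6 is a round-to-nearest division of the 64-pixel sum by 64. A scalar sketch:

#include <stdint.h>

static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
  int r, c, sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) sum += s[r * p + c];
  return (unsigned int)((sum + 32) >> 6);  // round-to-nearest divide by 64
}
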
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
+ __m128i s0, s1;
+ unsigned int avg;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const __m128i zero = _mm_setzero_si128();
+ s0 = _mm_loadu_si128((const __m128i *)(s));
+ s1 = _mm_loadu_si128((const __m128i *)(s + p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpackhi_epi16(s0, zero);
+ s0 = _mm_unpacklo_epi16(s0, zero);
+ s0 = _mm_add_epi32(s0, s1);
+ s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
+ avg = (unsigned int)_mm_cvtsi128_si32(s0);
+
+ return (avg + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
+ __m128i s0, s1;
+ unsigned int avg;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ s0 = _mm_loadl_epi64((const __m128i *)(s));
+ s1 = _mm_loadl_epi64((const __m128i *)(s + p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
+ avg = _mm_extract_epi16(s0, 0);
+
+ return (avg + 8) >> 4;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ if (is_final) {
+ store_tran_low(src[0], coeff);
+ coeff += 8;
+ store_tran_low(src[1], coeff);
+ coeff += 8;
+ store_tran_low(src[2], coeff);
+ coeff += 8;
+ store_tran_low(src[3], coeff);
+ coeff += 8;
+ store_tran_low(src[4], coeff);
+ coeff += 8;
+ store_tran_low(src[5], coeff);
+ coeff += 8;
+ store_tran_low(src[6], coeff);
+ coeff += 8;
+ store_tran_low(src[7], coeff);
+ } else {
+ int16_t *coeff16 = (int16_t *)coeff;
+ _mm_store_si128((__m128i *)coeff16, src[0]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[1]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[2]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[3]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[4]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[5]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[6]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[7]);
+ }
+}
+
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ if (is_final) {
+ store_tran_low(coeff0, coeff);
+ store_tran_low(coeff1, coeff + 64);
+ store_tran_low(coeff2, coeff + 128);
+ store_tran_low(coeff3, coeff + 192);
+ coeff += 8;
+ } else {
+ _mm_store_si128((__m128i *)coeff16, coeff0);
+ _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+ _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+ coeff16 += 8;
+ }
+
+ t_coeff += 8;
+ }
+}
+
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_sse2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 2);
+ b1 = _mm_srai_epi16(b1, 2);
+ b2 = _mm_srai_epi16(b2, 2);
+ b3 = _mm_srai_epi16(b3, 2);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low(coeff0, coeff);
+ store_tran_low(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low(coeff2, coeff + 512);
+ store_tran_low(coeff3, coeff + 768);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+int vpx_satd_sse2(const tran_low_t *coeff, int length) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
+
+ for (i = 0; i < length; i += 8) {
+ const __m128i src_line = load_tran_low(coeff);
+ const __m128i inv = _mm_sub_epi16(zero, src_line);
+ const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
+ const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+ const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+ const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+ accum = _mm_add_epi32(accum, sum);
+ coeff += 8;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
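vpx_int_pro_row_sse2 produces a 16-entry horizontal projection: each output is the sum of one pixel column over `height` rows, scaled by the shift matching the block height (5 for 64, 4 for 32, 3 otherwise). Ignoring the 16-bit saturation of the vector accumulators, a scalar sketch is:

#include <stdint.h>

static void int_pro_row_ref(int16_t hbuf[16], const uint8_t *ref,
                            int ref_stride, int height) {
  const int shift = (height == 64) ? 5 : (height == 32) ? 4 : 3;
  int c, r;
  for (c = 0; c < 16; ++c) {
    int sum = 0;
    for (r = 0; r < height; ++r) sum += ref[r * ref_stride + c];
    hbuf[c] = (int16_t)(sum >> shift);
  }
}
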
+int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
+
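vpx_int_pro_col_sse2 uses _mm_sad_epu8 against zero as a fast byte-summing primitive, so the whole function reduces to the plain sum of `width` pixels in a single row:

#include <stdint.h>

static int16_t int_pro_col_ref(const uint8_t *ref, int width) {
  int i, sum = 0;
  for (i = 0; i < width; ++i) sum += ref[i];
  return (int16_t)sum;
}
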
+int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
+ int idx;
+ int width = 4 << bwl;
+ int16_t mean;
+ __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i v1 = _mm_load_si128((const __m128i *)src);
+ __m128i diff = _mm_subs_epi16(v0, v1);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
+
+ ref += 8;
+ src += 8;
+
+ for (idx = 8; idx < width; idx += 8) {
+ v0 = _mm_loadu_si128((const __m128i *)ref);
+ v1 = _mm_load_si128((const __m128i *)src);
+ diff = _mm_subs_epi16(v0, v1);
+
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
+
+ ref += 8;
+ src += 8;
+ }
+
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = (int16_t)_mm_extract_epi16(sum, 0);
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
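vpx_vector_var_sse2 returns the scaled variance of the difference vector: the sum of squared differences minus the squared sum divided by the vector length 4 << bwl. Ignoring the 16-bit wrap-around of the horizontal sum in the vector code, the scalar equivalent is:

#include <stdint.h>

static int vector_var_ref(const int16_t *ref, const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int i, sum = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    sum += diff;
    sse += diff * diff;
  }
  // (sum * sum) / width, with width == 1 << (bwl + 2)
  return sse - ((sum * sum) >> (bwl + 2));
}
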
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
new file mode 100644
index 0000000000..c6e70f744e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ /* comp_pred and pred must be 16 byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
+ if (width > 8) {
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
+ const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
+ const __m128i avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)(comp_pred + x), avg);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else { // width must be 4 or 8.
+ int i;
+ // Process 16 elements at a time. comp_pred and pred have width == stride
+ // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+ // all divisible by 16 so just ref needs to be massaged when loading.
+ for (i = 0; i < width * height; i += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)pred);
+ __m128i r;
+ __m128i avg;
+ if (width == ref_stride) {
+ r = _mm_loadu_si128((const __m128i *)ref);
+ ref += 16;
+ } else if (width == 4) {
+ r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+ loadu_int32(ref + 2 * ref_stride),
+ loadu_int32(ref + ref_stride), loadu_int32(ref));
+
+ ref += 4 * ref_stride;
+ } else {
+ const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+ assert(width == 8);
+ r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
+ (const __m64 *)(ref + ref_stride)));
+
+ ref += 2 * ref_stride;
+ }
+ avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)comp_pred, avg);
+
+ pred += 16;
+ comp_pred += 16;
+ }
+ }
+}
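Whichever load path is taken, the arithmetic is the same: _mm_avg_epu8 computes a per-byte rounding average, so the function implements comp_pred[i] = (pred[i] + ref[i] + 1) >> 1 with comp_pred and pred using width as their stride. A scalar sketch:

#include <stdint.h>

static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      comp_pred[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);  // rounding avg
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
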
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
new file mode 100644
index 0000000000..9122b5a401
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%if VPX_ARCH_X86_64
+; matrix transpose
+%macro TRANSPOSE8X8 10
+ ; stage 1
+ punpcklwd m%9, m%1, m%2
+ punpcklwd m%10, m%3, m%4
+ punpckhwd m%1, m%2
+ punpckhwd m%3, m%4
+
+ punpcklwd m%2, m%5, m%6
+ punpcklwd m%4, m%7, m%8
+ punpckhwd m%5, m%6
+ punpckhwd m%7, m%8
+
+ ; stage 2
+ punpckldq m%6, m%9, m%10
+ punpckldq m%8, m%1, m%3
+ punpckhdq m%9, m%10
+ punpckhdq m%1, m%3
+
+ punpckldq m%10, m%2, m%4
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%2, m%4
+ punpckhdq m%5, m%7
+
+ ; stage 3
+ punpckhqdq m%4, m%9, m%2 ; out3
+ punpcklqdq m%9, m%2 ; out2
+ punpcklqdq m%7, m%1, m%5 ; out6
+ punpckhqdq m%1, m%5 ; out7
+
+ punpckhqdq m%2, m%6, m%10 ; out1
+ punpcklqdq m%6, m%10 ; out0
+ punpcklqdq m%5, m%8, m%3 ; out4
+ punpckhqdq m%8, m%3 ; out5
+
+ SWAP %6, %1
+ SWAP %3, %9
+ SWAP %8, %6
+%endmacro
+
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
+
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 11, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
+ HMD8_1D
+
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 8, 8, 9
+ STORE_TRAN_LOW 2, outputq, 16, 8, 9
+ STORE_TRAN_LOW 3, outputq, 24, 8, 9
+ STORE_TRAN_LOW 4, outputq, 32, 8, 9
+ STORE_TRAN_LOW 5, outputq, 40, 8, 9
+ STORE_TRAN_LOW 6, outputq, 48, 8, 9
+ STORE_TRAN_LOW 7, outputq, 56, 8, 9
+
+ RET
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..c02b47a3eb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 16 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+ return _mm256_packs_epi32(a_low, a_high);
+#else
+ return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+#else
+ _mm256_storeu_si256((__m256i *)b, a);
+#endif
+}
+#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 0000000000..aacf71f7ac
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,90 @@
+;
+; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so it cannot be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Increment register by sizeof() tran_low_t * 8.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+ add %1, 32
+%else
+ add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea %1, [%1 + %2 * 4]
+%else
+ lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%else
+ mova m%1, [%2 + (%3) * 2]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %3 is the offset in elements, not bytes.
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4-m%6 as scratch registers for high bit depth.
+%macro STORE_TRAN_LOW 5-6
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m%4, m%4
+ mova m%5, m%1
+ %if %0 == 6
+ mova m%6, m%1
+ %endif
+ pcmpgtw m%4, m%1
+ punpcklwd m%5, m%4
+ %if %0 == 5
+ punpckhwd m%1, m%4
+ %else
+ punpckhwd m%6, m%4
+ %endif
+ mova [%2 + (%3) * 4 + 0], m%5
+ %if %0 == 5
+ mova [%2 + (%3) * 4 + 16], m%1
+ %else
+ mova [%2 + (%3) * 4 + 16], m%6
+ %endif
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [%2 + (%3) * 4 + 0], m%1
+ mova [%2 + (%3) * 4 + 16], m%1
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..74dde656b1
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+
+#include <xmmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+#else
+ return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+ _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m128i zero = _mm_setzero_si128();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(a), zero);
+ _mm_store_si128((__m128i *)(a + 4), zero);
+#else
+ _mm_store_si128((__m128i *)(a), zero);
+#endif
+}
+#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
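The store_tran_low helpers in bitdepth_conversion_sse2.h and bitdepth_conversion_avx2.h rely on the same trick: multiplying by 1 with mulhi/mullo yields the high and low halves of each 16-bit value's 32-bit sign extension, and interleaving them with unpacklo/unpackhi reassembles the widened coefficients without needing a dedicated sign-extend instruction on SSE2. Per lane, the effect is (scalar model, illustrative only):

#include <stdint.h>

static int32_t sign_extend_16_to_32(int16_t a) {
  const uint16_t lo = (uint16_t)a;                // _mm_mullo_epi16(a, 1)
  const uint16_t hi = (a < 0) ? 0xffff : 0x0000;  // _mm_mulhi_epi16(a, 1)
  return (int32_t)(((uint32_t)hi << 16) | lo);    // unpacklo/unpackhi interleave
}
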
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
new file mode 100644
index 0000000000..c339600556
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/compiler_attributes.h"
+
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
+// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
+// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function
+// assumes the filter is always 8 tap.
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
+// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we
+// have 4-tap vert avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
+ void vpx_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter_row = filter[offset]; \
+ (void)x0_q4; \
+ (void)x_step_q4; \
+ (void)y0_q4; \
+ (void)y_step_q4; \
+ assert(filter_row[3] != 128); \
+ assert(step_q4 == 16); \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } else { \
+ const int num_taps = 2; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } \
+ }
+
+#define FUN_CONV_2D(avg, opt, is_avg) \
+ void vpx_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ const int16_t *filter_y = filter[y0_q4]; \
+ (void)filter_y; \
+ assert(filter_x[3] != 128); \
+ assert(filter_y[3] != 128); \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h + 7); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \
+ dst, dst_stride, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h + 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h); \
+ } \
+ }
+
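FUN_CONV_2D expands into a two-pass separable filter: the horizontal pass writes a 64-column intermediate block that is taller than the output by num_taps - 1 rows (starting 3 rows above the block in the full 8-tap case), and the vertical pass then consumes that buffer. A scalar sketch of the 8-tap path, using the usual 7-bit filter rounding and pixel clipping (names here are illustrative, not libvpx API):

#include <stdint.h>

#define REF_TAPS 8
#define REF_MID 64  // width of the intermediate buffer, as in the macro

static uint8_t ref_clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void ref_convolve8_2d(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, const int16_t fx[REF_TAPS],
                             const int16_t fy[REF_TAPS], int w, int h) {
  uint8_t tmp[REF_MID * (64 + REF_TAPS - 1)];
  int r, c, k;
  // Horizontal pass over h + 7 rows, starting 3 rows above the block.
  for (r = 0; r < h + REF_TAPS - 1; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < REF_TAPS; ++k)
        sum += fx[k] * src[(r - 3) * src_stride + (c - 3 + k)];
      tmp[r * REF_MID + c] = ref_clip_pixel((sum + 64) >> 7);
    }
  }
  // Vertical pass: output row r reads tmp rows r..r+7 (src rows r-3..r+4).
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < REF_TAPS; ++k) sum += fy[k] * tmp[(r + k) * REF_MID + c];
      dst[r * dst_stride + c] = ref_clip_pixel((sum + 64) >> 7);
    }
  }
}
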
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
+ is_avg) \
+ void vpx_highbd_convolve8_##name##_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_row = filter_kernel[offset]; \
+ if (step_q4 == 16 && filter_row[3] != 128) { \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else { \
+ const int num_taps = 2; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_kernel, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h, bd); \
+ } \
+ }
+
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
+ void vpx_highbd_convolve8_##avg##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
+ filter_x[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ fdata2, 64, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \
+ bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, \
+ w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \
+ bd); \
+ } \
+ }
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..ebee964b18
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_config.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+static INLINE void shuffle_filter_avx2(const int16_t *const filter,
+ __m256i *const f) {
+ const __m256i f_values =
+ MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
+ // pack and duplicate the filter values
+ f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
+ f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
+ f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
+ f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m256i k_64 = _mm256_set1_epi16(1 << 6);
+ const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
+ const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
+ const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
+ const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
+ __m256i sum1, sum2;
+
+  // sum the results together, saturating only on the final step;
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // overflow for all filters
+ sum1 = _mm256_add_epi16(x0, x2);
+ sum2 = _mm256_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm256_add_epi16(sum1, k_64);
+ sum1 = _mm256_adds_epi16(sum1, sum2);
+  // round and shift each 16-bit value right by 7 bits
+ sum1 = _mm256_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
+ _mm256_castsi256_si128(f[0]));
+ const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
+ _mm256_castsi256_si128(f[1]));
+ const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
+ _mm256_castsi256_si128(f[2]));
+ const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
+ _mm256_castsi256_si128(f[3]));
+ __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step;
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // overflow for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit value right by 7 bits
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
+}
+
+static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
+}
+
+static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE __m256i mm256_round_epi32(const __m256i *const src,
+ const __m256i *const half_depth,
+ const int depth) {
+ const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
+ return _mm256_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_round_epi16(const __m256i *const src,
+ const __m256i *const half_depth,
+ const int depth) {
+ const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
+ return _mm256_srai_epi16(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
+ const __m256i *const src_1,
+ const __m256i *const ker_0,
+ const __m256i *const ker_1) {
+ const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
+ const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
+ return _mm256_add_epi32(tmp_0, tmp_1);
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
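
Not part of the patch: all of the convolve8_* helpers above compute the same fixed-point step per output value. A scalar sketch follows (illustrative name; the 7-bit scale is what the k_64 constant and the shift by 7 encode):

#include <stdint.h>

/* one output of an 8-tap filter at the 7-bit scale: accumulate the eight
 * products, add the rounding offset 1 << 6, then arithmetic-shift right by 7.
 * Like the SIMD versions, this returns a 16-bit value that is only clamped
 * to pixel range later. */
static int16_t ref_convolve8_px(const uint8_t *src, const int16_t *filter) {
  int k, sum = 1 << 6;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  return (int16_t)(sum >> 7);
}
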
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..8443546394
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// broadcasts the words at indices 2 and 3 to return 3 2 3 2 3 2 3 2 as
+// 16-bit words.
+static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
+ return _mm_unpackhi_epi64(tmp, tmp);
+}
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// broadcasts the words at indices 4 and 5 to return 5 4 5 4 5 4 5 4 as
+// 16-bit words.
+static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
+ return _mm_unpacklo_epi64(tmp, tmp);
+}
+
+// Interprets src as 8-bit words, zero-extends them to 16-bit words, then
+// multiplies with ker and adds the adjacent results to form 32-bit words.
+// Finally adds the results of the two madds together.
+static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
+ const __m128i *const src_2,
+ const __m128i *const ker_1,
+ const __m128i *const ker_2) {
+ const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
+ const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
+ const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
+ const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
+ return _mm_add_epi32(madd_1, madd_2);
+}
+
+// Interprets src as 16-bit words, then multiplies with ker and adds the
+// adjacent results to form 32-bit words. Finally adds the results of the two
+// madds together.
+static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
+ const __m128i *const src_2,
+ const __m128i *const ker_1,
+ const __m128i *const ker_2) {
+ const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
+ const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
+ return _mm_add_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
+ const __m128i *const src_1,
+ const __m128i *const ker) {
+ const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
+ const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
+ return _mm_packs_epi32(madd_1, madd_2);
+}
+
+// Interleaves src_1 and src_2
+static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
+ const __m128i *const src_2) {
+ const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
+ const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
+ return _mm_packs_epi32(tmp_1, tmp_2);
+}
+
+static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
+ const __m128i *const half_depth,
+ const int depth) {
+ const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
+ return _mm_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
+ const __m128i *const half_depth,
+ const int depth) {
+ const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
+ return _mm_srai_epi16(nearest_src, depth);
+}
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
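
Not part of the patch: mm_round_epi32_sse2() and mm_round_epi16_sse2() above are plain round-to-nearest right shifts. A scalar sketch, assuming the caller passes half_depth == 1 << (depth - 1):

#include <stdint.h>

/* round-to-nearest division by 2^depth, the scalar form of the two helpers */
static int32_t ref_round(int32_t v, int depth) {
  return (v + (1 << (depth - 1))) >> depth;
}
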
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..8a4b165133
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h> // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // It relies on the fact that the high byte of filter[3] is always 0 to zero
+  // out half of f[0] and f[4].
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step;
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // overflow for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit value right by 7 bits
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate for the 64 subtracted in f[1]. x4 is always non-negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+  // round and shift each 16-bit value right by 7 bits
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate for the 64 subtracted in f[2]. x5 is always non-negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+  // round and shift each 16-bit value right by 7 bits
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
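
Not part of the patch: a scalar view of the tap pairing that shuffle_filter_ssse3() sets up for _mm_maddubs_epi16(). After the shuffle, f[j] holds the byte pair (filter[2*j], filter[2*j+1]) in every 16-bit lane, so in the typical usage where s[j] carries the matching pixel pairs each madd contributes the partial sum sketched below, and the four partials add up to the full 8-tap result. This assumes every tap fits in a signed byte; the even/odd-offset variants above exist for the case where the large centre tap does not, which is why they add back a separate 64 * src term (see the "compensate the subtracted 64" comments).

#include <stdint.h>

/* partial sum produced by the j-th maddubs pair (j = 0..3) at pixel offset i */
static int ref_pairwise_tap(const uint8_t *src, const int16_t *filter, int j,
                            int i) {
  return src[i + 2 * j] * filter[2 * j] +
         src[i + 2 * j + 1] * filter[2 * j + 1];
}
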
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
new file mode 100644
index 0000000000..b3af677d2e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;macros used by the deblock functions below
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+    ;decide whether or not to use the filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+ movdqu xmm2, XMMWORD PTR [rbx]
+ movdqu [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+SECTION .text
+
+;void vpx_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits,
+; int size
+;)
+globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+    ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone:
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+    ; done with this row
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
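
Not part of the patch: a scalar sketch of the per-pixel decision that the FIRST_2_ROWS / SECOND_2_ROWS macros implement for the vertical (down) pass above; the across pass repeats it with the two left and two right neighbours. pavgb rounds upward, hence the +1 terms. Names are illustrative, and limit stands for the per-column flimit value loaded by UPDATE_FLIMIT.

#include <stdint.h>
#include <stdlib.h>

/* v: centre pixel; a1/a2: one and two rows above; b1/b2: one and two rows
 * below; limit: the flimit threshold for this column. */
static uint8_t ref_down_filter_px(uint8_t v, uint8_t a1, uint8_t a2,
                                  uint8_t b1, uint8_t b2, uint8_t limit) {
  if (abs(v - a1) >= limit || abs(v - a2) >= limit || abs(v - b1) >= limit ||
      abs(v - b2) >= limit) {
    return v; /* a large step in any direction: leave the pixel alone */
  } else {
    const int below = (b1 + b2 + 1) >> 1; /* pavgb of the two rows below */
    const int above = (a1 + a2 + 1) >> 1; /* pavgb of the two rows above */
    const int both = (below + above + 1) >> 1;
    return (uint8_t)((both + v + 1) >> 1); /* final pavgb with the centre */
  }
}
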
+
+
+;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+globalsym(vpx_mbpost_proc_across_ip_sse2)
+sym(vpx_mbpost_proc_across_ip_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3)
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+SECTION_RODATA
+align 16
+four8s:
+ times 4 dd 8
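
Not part of the patch: a scalar sketch of what each .nextcol4 iteration above decides per pixel. Here sum and sumsq are running totals over the fifteen pixels s[c-7]..s[c+7], kept up to date by adding the s[c+7] / s[c-8] differences exactly as the pseudo-code comments describe, and a pixel is smoothed only when the local variance test passes. Names are illustrative.

#include <stdint.h>

/* smooth s[c] when the 15*sumsq - sum^2 variance measure is below flimit;
 * the replacement is a /16 average in which the centre pixel counts twice. */
static uint8_t ref_across_filter_px(const uint8_t *s, int c, int sum,
                                    int sumsq, int flimit) {
  if (sumsq * 15 - sum * sum < flimit) {
    return (uint8_t)((sum + s[c] + 8) >> 4);
  }
  return s[c];
}
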
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
new file mode 100644
index 0000000000..f3a8020292
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -0,0 +1,2930 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a))
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+ __m256i buf0, buf1;
+ buf0 = _mm256_mul_epu32(a, b);
+ a = _mm256_srli_epi64(a, 32);
+ b = _mm256_srli_epi64(b, 32);
+ buf1 = _mm256_mul_epu32(a, b);
+ return _mm256_add_epi64(buf0, buf1);
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+ __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm256_unpacklo_epi64(buf0, buf1);
+}
+#endif
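+
Not part of the patch: the madd / add / srai / packs sequences that dominate the transform below are all instances of the same scalar butterfly, matching dct_const_round_shift() from vpx_dsp/txfm_common.h (included above), where DCT_CONST_BITS is 14 and DCT_CONST_ROUNDING is 1 << 13. A sketch with an illustrative name:

#include "vpx_dsp/txfm_common.h" /* DCT_CONST_BITS, DCT_CONST_ROUNDING */
#include <stdint.h>

/* rotate the pair (a, b) by the cosine pair (c1, c2) and round the 14-bit
 * fixed-point product back down; the sum fits in 32 bits for 16-bit inputs. */
static int16_t ref_butterfly(int16_t a, int16_t b, int c1, int c2) {
  const int sum = a * c1 + b * c2;
  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
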
+
+void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+ // Constants
+  // When we use them, in one case, they are all the same. In all others
+  // it's a pair of them that we need to repeat eight times across the
+  // register. This is done by constructing the 32-bit constant
+  // corresponding to that pair.
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+ const __m256i k__cospi_p16_m16 =
+ pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i k__cospi_m12_m20 =
+ pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ const __m256i kZero = _mm256_setzero_si256();
+ const __m256i kOne = _mm256_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process sixteen columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 16) {
+ __m256i step1[32];
+ __m256i step2[32];
+ __m256i step3[32];
+ __m256i out[32];
+ // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[0];
+ __m256i *step1b = &step1[31];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[4];
+ __m256i *step1b = &step1[27];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[8];
+ __m256i *step1b = &step1[23];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m256i *step1a = &step1[12];
+ __m256i *step1b = &step1[19];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+        // Note: using the same approach as above to share a common offset is
+        // counter-productive as all offsets can be calculated at compile
+        // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[0] = _mm256_add_epi16(in00, in31);
+ step1[1] = _mm256_add_epi16(in01, in30);
+ step1[2] = _mm256_add_epi16(in02, in29);
+ step1[3] = _mm256_add_epi16(in03, in28);
+ step1[28] = _mm256_sub_epi16(in03, in28);
+ step1[29] = _mm256_sub_epi16(in02, in29);
+ step1[30] = _mm256_sub_epi16(in01, in30);
+ step1[31] = _mm256_sub_epi16(in00, in31);
+ }
+ {
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[4] = _mm256_add_epi16(in04, in27);
+ step1[5] = _mm256_add_epi16(in05, in26);
+ step1[6] = _mm256_add_epi16(in06, in25);
+ step1[7] = _mm256_add_epi16(in07, in24);
+ step1[24] = _mm256_sub_epi16(in07, in24);
+ step1[25] = _mm256_sub_epi16(in06, in25);
+ step1[26] = _mm256_sub_epi16(in05, in26);
+ step1[27] = _mm256_sub_epi16(in04, in27);
+ }
+ {
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[8] = _mm256_add_epi16(in08, in23);
+ step1[9] = _mm256_add_epi16(in09, in22);
+ step1[10] = _mm256_add_epi16(in10, in21);
+ step1[11] = _mm256_add_epi16(in11, in20);
+ step1[20] = _mm256_sub_epi16(in11, in20);
+ step1[21] = _mm256_sub_epi16(in10, in21);
+ step1[22] = _mm256_sub_epi16(in09, in22);
+ step1[23] = _mm256_sub_epi16(in08, in23);
+ }
+ {
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ step1[12] = _mm256_add_epi16(in12, in19);
+ step1[13] = _mm256_add_epi16(in13, in18);
+ step1[14] = _mm256_add_epi16(in14, in17);
+ step1[15] = _mm256_add_epi16(in15, in16);
+ step1[16] = _mm256_sub_epi16(in15, in16);
+ step1[17] = _mm256_sub_epi16(in14, in17);
+ step1[18] = _mm256_sub_epi16(in13, in18);
+ step1[19] = _mm256_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm256_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
+ step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+ const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+ const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+ const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+ const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+ const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+ const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+ const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+ const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s2_20_4 =
+ _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 =
+ _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 =
+ _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 =
+ _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 =
+ _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 =
+ _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 =
+ _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 =
+ _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 =
+ _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 =
+ _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 =
+ _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 =
+ _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 =
+ _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 =
+ _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 =
+ _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 =
+ _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+    // halve the magnitude so that the intermediate values stay within
+    // the range of 16 bits.
+ if (1 == pass) {
+ __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
+ __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
+ __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
+ __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
+ __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
+ __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
+ __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
+ __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
+ __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
+ __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
+ __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
+ __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
+ __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
+ __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
+ __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
+ __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
+ __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
+ __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
+ __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
+ __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
+ __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
+ __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
+ __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
+ __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
+ __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
+ __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
+ __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
+ __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
+ __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
+ __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
+ __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
+ __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
+
+ step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
+ step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
+ step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
+ step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
+ step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
+ step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
+ step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
+ step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
+ step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
+ step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
+ step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+ step2[0] = _mm256_add_epi16(step2[0], kOne);
+ step2[1] = _mm256_add_epi16(step2[1], kOne);
+ step2[2] = _mm256_add_epi16(step2[2], kOne);
+ step2[3] = _mm256_add_epi16(step2[3], kOne);
+ step2[4] = _mm256_add_epi16(step2[4], kOne);
+ step2[5] = _mm256_add_epi16(step2[5], kOne);
+ step2[6] = _mm256_add_epi16(step2[6], kOne);
+ step2[7] = _mm256_add_epi16(step2[7], kOne);
+ step2[8] = _mm256_add_epi16(step2[8], kOne);
+ step2[9] = _mm256_add_epi16(step2[9], kOne);
+ step2[10] = _mm256_add_epi16(step2[10], kOne);
+ step2[11] = _mm256_add_epi16(step2[11], kOne);
+ step2[12] = _mm256_add_epi16(step2[12], kOne);
+ step2[13] = _mm256_add_epi16(step2[13], kOne);
+ step2[14] = _mm256_add_epi16(step2[14], kOne);
+ step2[15] = _mm256_add_epi16(step2[15], kOne);
+ step1[16] = _mm256_add_epi16(step1[16], kOne);
+ step1[17] = _mm256_add_epi16(step1[17], kOne);
+ step1[18] = _mm256_add_epi16(step1[18], kOne);
+ step1[19] = _mm256_add_epi16(step1[19], kOne);
+ step2[20] = _mm256_add_epi16(step2[20], kOne);
+ step2[21] = _mm256_add_epi16(step2[21], kOne);
+ step2[22] = _mm256_add_epi16(step2[22], kOne);
+ step2[23] = _mm256_add_epi16(step2[23], kOne);
+ step2[24] = _mm256_add_epi16(step2[24], kOne);
+ step2[25] = _mm256_add_epi16(step2[25], kOne);
+ step2[26] = _mm256_add_epi16(step2[26], kOne);
+ step2[27] = _mm256_add_epi16(step2[27], kOne);
+ step1[28] = _mm256_add_epi16(step1[28], kOne);
+ step1[29] = _mm256_add_epi16(step1[29], kOne);
+ step1[30] = _mm256_add_epi16(step1[30], kOne);
+ step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm256_srai_epi16(step2[0], 2);
+ step2[1] = _mm256_srai_epi16(step2[1], 2);
+ step2[2] = _mm256_srai_epi16(step2[2], 2);
+ step2[3] = _mm256_srai_epi16(step2[3], 2);
+ step2[4] = _mm256_srai_epi16(step2[4], 2);
+ step2[5] = _mm256_srai_epi16(step2[5], 2);
+ step2[6] = _mm256_srai_epi16(step2[6], 2);
+ step2[7] = _mm256_srai_epi16(step2[7], 2);
+ step2[8] = _mm256_srai_epi16(step2[8], 2);
+ step2[9] = _mm256_srai_epi16(step2[9], 2);
+ step2[10] = _mm256_srai_epi16(step2[10], 2);
+ step2[11] = _mm256_srai_epi16(step2[11], 2);
+ step2[12] = _mm256_srai_epi16(step2[12], 2);
+ step2[13] = _mm256_srai_epi16(step2[13], 2);
+ step2[14] = _mm256_srai_epi16(step2[14], 2);
+ step2[15] = _mm256_srai_epi16(step2[15], 2);
+ step1[16] = _mm256_srai_epi16(step1[16], 2);
+ step1[17] = _mm256_srai_epi16(step1[17], 2);
+ step1[18] = _mm256_srai_epi16(step1[18], 2);
+ step1[19] = _mm256_srai_epi16(step1[19], 2);
+ step2[20] = _mm256_srai_epi16(step2[20], 2);
+ step2[21] = _mm256_srai_epi16(step2[21], 2);
+ step2[22] = _mm256_srai_epi16(step2[22], 2);
+ step2[23] = _mm256_srai_epi16(step2[23], 2);
+ step2[24] = _mm256_srai_epi16(step2[24], 2);
+ step2[25] = _mm256_srai_epi16(step2[25], 2);
+ step2[26] = _mm256_srai_epi16(step2[26], 2);
+ step2[27] = _mm256_srai_epi16(step2[27], 2);
+ step1[28] = _mm256_srai_epi16(step1[28], 2);
+ step1[29] = _mm256_srai_epi16(step1[29], 2);
+ step1[30] = _mm256_srai_epi16(step1[30], 2);
+ step1[31] = _mm256_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[0] = _mm256_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm256_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm256_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm256_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 =
+ _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 =
+ _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 =
+ _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 =
+ _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 =
+ _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 =
+ _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 =
+ _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 =
+ _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 =
+ _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 =
+ _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 =
+ _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 =
+ _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 =
+ _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 =
+ _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 =
+ _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 =
+ _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 =
+ _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 =
+ _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 =
+ _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 =
+ _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 =
+ _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 =
+ _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 =
+ _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 =
+ _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 =
+ _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 =
+ _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 =
+ _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 =
+ _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 =
+ _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 =
+ _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 =
+ _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 =
+ _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 =
+ _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 =
+ _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 =
+ _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 =
+ _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 =
+ _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 =
+ _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 =
+ _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 =
+ _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 =
+ _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 =
+ _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 =
+ _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 =
+ _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 =
+ _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 =
+ _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 =
+ _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 =
+ _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 =
+ _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 =
+ _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 =
+ _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 =
+ _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm256_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 =
+ _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 =
+ _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 =
+ _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 =
+ _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 =
+ _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 =
+ _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 =
+ _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 =
+ _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 =
+ _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 =
+ _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 =
+ _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 =
+ _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 =
+ _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 =
+ _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 =
+ _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 =
+ _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 =
+ _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 =
+ _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 =
+ _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 =
+ _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 =
+ _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 =
+ _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 =
+ _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 =
+ _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 =
+ _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 =
+ _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 =
+ _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 =
+ _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 =
+ _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 =
+ _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 =
+ _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 =
+ _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+ // Final stage --- output indices are bit-reversed.
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m256i out_01_4 =
+ _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 =
+ _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 =
+ _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 =
+ _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 =
+ _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 =
+ _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 =
+ _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 =
+ _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 =
+ _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 =
+ _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 =
+ _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 =
+ _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 =
+ _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 =
+ _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 =
+ _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 =
+ _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 =
+ _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 =
+ _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 =
+ _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 =
+ _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 =
+ _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 =
+ _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 =
+ _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 =
+ _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 =
+ _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 =
+ _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 =
+ _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 =
+ _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 =
+ _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 =
+ _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 =
+ _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 =
+ _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m256i lstep1[64], lstep2[64], lstep3[64];
+ __m256i u[32], v[32], sign[16];
+ const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ const __m256i k__pOne_mOne = pair256_set_epi16(1, -1);
+ // start using 32-bit operations
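+ // Convention in this path: lstepN[2 * i] / lstepN[2 * i + 1] carry stepN[i]
+ // widened to 32 bits (low/high halves); a few entries double as interleaved
+ // scratch for the madd-based widening below.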
+ // stage 3
+ {
+ // expanding to 32-bit length while adding and subtracting
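+ // With the values interleaved pairwise, _mm256_madd_epi16 against kOne
+ // (pairs of 1, 1) yields the widened pairwise sums, and against
+ // k__pOne_mOne (pairs of 1, -1) the widened pairwise differences.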
+ lstep2[0] = _mm256_unpacklo_epi16(step2[0], step2[7]);
+ lstep2[1] = _mm256_unpackhi_epi16(step2[0], step2[7]);
+ lstep2[2] = _mm256_unpacklo_epi16(step2[1], step2[6]);
+ lstep2[3] = _mm256_unpackhi_epi16(step2[1], step2[6]);
+ lstep2[4] = _mm256_unpacklo_epi16(step2[2], step2[5]);
+ lstep2[5] = _mm256_unpackhi_epi16(step2[2], step2[5]);
+ lstep2[6] = _mm256_unpacklo_epi16(step2[3], step2[4]);
+ lstep2[7] = _mm256_unpackhi_epi16(step2[3], step2[4]);
+
+ lstep3[0] = _mm256_madd_epi16(lstep2[0], kOne);
+ lstep3[1] = _mm256_madd_epi16(lstep2[1], kOne);
+ lstep3[2] = _mm256_madd_epi16(lstep2[2], kOne);
+ lstep3[3] = _mm256_madd_epi16(lstep2[3], kOne);
+ lstep3[4] = _mm256_madd_epi16(lstep2[4], kOne);
+ lstep3[5] = _mm256_madd_epi16(lstep2[5], kOne);
+ lstep3[6] = _mm256_madd_epi16(lstep2[6], kOne);
+ lstep3[7] = _mm256_madd_epi16(lstep2[7], kOne);
+
+ lstep3[8] = _mm256_madd_epi16(lstep2[6], k__pOne_mOne);
+ lstep3[9] = _mm256_madd_epi16(lstep2[7], k__pOne_mOne);
+ lstep3[10] = _mm256_madd_epi16(lstep2[4], k__pOne_mOne);
+ lstep3[11] = _mm256_madd_epi16(lstep2[5], k__pOne_mOne);
+ lstep3[12] = _mm256_madd_epi16(lstep2[2], k__pOne_mOne);
+ lstep3[13] = _mm256_madd_epi16(lstep2[3], k__pOne_mOne);
+ lstep3[14] = _mm256_madd_epi16(lstep2[0], k__pOne_mOne);
+ lstep3[15] = _mm256_madd_epi16(lstep2[1], k__pOne_mOne);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep1[32] = _mm256_unpacklo_epi16(step1[16], step2[23]);
+ lstep1[33] = _mm256_unpackhi_epi16(step1[16], step2[23]);
+ lstep1[34] = _mm256_unpacklo_epi16(step1[17], step2[22]);
+ lstep1[35] = _mm256_unpackhi_epi16(step1[17], step2[22]);
+ lstep1[36] = _mm256_unpacklo_epi16(step1[18], step2[21]);
+ lstep1[37] = _mm256_unpackhi_epi16(step1[18], step2[21]);
+ lstep1[38] = _mm256_unpacklo_epi16(step1[19], step2[20]);
+ lstep1[39] = _mm256_unpackhi_epi16(step1[19], step2[20]);
+
+ lstep1[56] = _mm256_unpacklo_epi16(step1[28], step2[27]);
+ lstep1[57] = _mm256_unpackhi_epi16(step1[28], step2[27]);
+ lstep1[58] = _mm256_unpacklo_epi16(step1[29], step2[26]);
+ lstep1[59] = _mm256_unpackhi_epi16(step1[29], step2[26]);
+ lstep1[60] = _mm256_unpacklo_epi16(step1[30], step2[25]);
+ lstep1[61] = _mm256_unpackhi_epi16(step1[30], step2[25]);
+ lstep1[62] = _mm256_unpacklo_epi16(step1[31], step2[24]);
+ lstep1[63] = _mm256_unpackhi_epi16(step1[31], step2[24]);
+
+ lstep3[32] = _mm256_madd_epi16(lstep1[32], kOne);
+ lstep3[33] = _mm256_madd_epi16(lstep1[33], kOne);
+ lstep3[34] = _mm256_madd_epi16(lstep1[34], kOne);
+ lstep3[35] = _mm256_madd_epi16(lstep1[35], kOne);
+ lstep3[36] = _mm256_madd_epi16(lstep1[36], kOne);
+ lstep3[37] = _mm256_madd_epi16(lstep1[37], kOne);
+ lstep3[38] = _mm256_madd_epi16(lstep1[38], kOne);
+ lstep3[39] = _mm256_madd_epi16(lstep1[39], kOne);
+
+ lstep3[40] = _mm256_madd_epi16(lstep1[38], k__pOne_mOne);
+ lstep3[41] = _mm256_madd_epi16(lstep1[39], k__pOne_mOne);
+ lstep3[42] = _mm256_madd_epi16(lstep1[36], k__pOne_mOne);
+ lstep3[43] = _mm256_madd_epi16(lstep1[37], k__pOne_mOne);
+ lstep3[44] = _mm256_madd_epi16(lstep1[34], k__pOne_mOne);
+ lstep3[45] = _mm256_madd_epi16(lstep1[35], k__pOne_mOne);
+ lstep3[46] = _mm256_madd_epi16(lstep1[32], k__pOne_mOne);
+ lstep3[47] = _mm256_madd_epi16(lstep1[33], k__pOne_mOne);
+
+ lstep3[48] = _mm256_madd_epi16(lstep1[62], k__pOne_mOne);
+ lstep3[49] = _mm256_madd_epi16(lstep1[63], k__pOne_mOne);
+ lstep3[50] = _mm256_madd_epi16(lstep1[60], k__pOne_mOne);
+ lstep3[51] = _mm256_madd_epi16(lstep1[61], k__pOne_mOne);
+ lstep3[52] = _mm256_madd_epi16(lstep1[58], k__pOne_mOne);
+ lstep3[53] = _mm256_madd_epi16(lstep1[59], k__pOne_mOne);
+ lstep3[54] = _mm256_madd_epi16(lstep1[56], k__pOne_mOne);
+ lstep3[55] = _mm256_madd_epi16(lstep1[57], k__pOne_mOne);
+
+ lstep3[56] = _mm256_madd_epi16(lstep1[56], kOne);
+ lstep3[57] = _mm256_madd_epi16(lstep1[57], kOne);
+ lstep3[58] = _mm256_madd_epi16(lstep1[58], kOne);
+ lstep3[59] = _mm256_madd_epi16(lstep1[59], kOne);
+ lstep3[60] = _mm256_madd_epi16(lstep1[60], kOne);
+ lstep3[61] = _mm256_madd_epi16(lstep1[61], kOne);
+ lstep3[62] = _mm256_madd_epi16(lstep1[62], kOne);
+ lstep3[63] = _mm256_madd_epi16(lstep1[63], kOne);
+ }
+
+ // stage 4
+ {
+ // expanding to 32-bit length prior to addition operations
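+ // _mm256_cmpgt_epi16(kZero, x) gives an all-ones mask for negative lanes,
+ // so interleaving each value with its sign mask sign-extends it to 32 bits.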
+ sign[0] = _mm256_cmpgt_epi16(kZero, step2[8]);
+ sign[1] = _mm256_cmpgt_epi16(kZero, step2[9]);
+ sign[2] = _mm256_cmpgt_epi16(kZero, step2[14]);
+ sign[3] = _mm256_cmpgt_epi16(kZero, step2[15]);
+ lstep2[16] = _mm256_unpacklo_epi16(step2[8], sign[0]);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[8], sign[0]);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[9], sign[1]);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[9], sign[1]);
+ lstep2[28] = _mm256_unpacklo_epi16(step2[14], sign[2]);
+ lstep2[29] = _mm256_unpackhi_epi16(step2[14], sign[2]);
+ lstep2[30] = _mm256_unpacklo_epi16(step2[15], sign[3]);
+ lstep2[31] = _mm256_unpackhi_epi16(step2[15], sign[3]);
+
+ lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+ // 32-bit version of the Stage 4 step1[5]/step1[6] cospi_16 butterfly
+ // above, producing lstep1[10..13].
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
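+ // pair256_set_epi32(a, b) replicates the coefficient pair across the 32-bit
+ // lanes, matching the (x, y) interleave produced by unpacklo/unpackhi_epi32
+ // below.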
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_avx2() to further hide
+ // instruction latency.
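+ // k_madd_epi32_avx2() acts as a 32-bit _mm256_madd_epi16: for each
+ // interleaved pair (x, y) and coefficient pair (c0, c1) it forms the 64-bit
+ // sums x * c0 + y * c1; k_packs_epi64_avx2() then narrows those results
+ // back to 32-bit lanes.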
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32_avx2() to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
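+ // Rounded down-scale by 4: subtracting the sign mask adds 1 to negative
+ // lanes, so together with K32One this computes (x + 1 + (x < 0)) >> 2.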
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ // Combine
+ out[0] = _mm256_packs_epi32(u[0], u[1]);
+ out[16] = _mm256_packs_epi32(u[2], u[3]);
+ out[8] = _mm256_packs_epi32(u[4], u[5]);
+ out[24] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ out[4] = _mm256_packs_epi32(u[0], u[1]);
+ out[20] = _mm256_packs_epi32(u[2], u[3]);
+ out[12] = _mm256_packs_epi32(u[4], u[5]);
+ out[28] = _mm256_packs_epi32(u[6], u[7]);
+ }
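+ // Note: a rough scalar model of the block above: each interleaved pair
+ // (a, b) is combined with a cosine pair (c0, c1) as
+ //   y = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS);
+ //   out = (y + 1 + (y < 0)) >> 2;
+ // where the k_madd_epi32_avx2()/k_packs_epi64_avx2() helpers carry out
+ // the weighted sums on interleaved 32-bit lanes.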
+ {
+ lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 =
+ pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 =
+ pair256_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m256i k32_p30_p02 =
+ pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 =
+ pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 =
+ pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 =
+ pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 =
+ pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 =
+ pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 =
+ pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 =
+ pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[2] = _mm256_packs_epi32(u[0], u[1]);
+ out[18] = _mm256_packs_epi32(u[2], u[3]);
+ out[10] = _mm256_packs_epi32(u[4], u[5]);
+ out[26] = _mm256_packs_epi32(u[6], u[7]);
+ out[6] = _mm256_packs_epi32(u[8], u[9]);
+ out[22] = _mm256_packs_epi32(u[10], u[11]);
+ out[14] = _mm256_packs_epi32(u[12], u[13]);
+ out[30] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m256i k32_p31_p01 =
+ pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 =
+ pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 =
+ pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 =
+ pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 =
+ pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 =
+ pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 =
+ pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 =
+ pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[1] = _mm256_packs_epi32(u[0], u[1]);
+ out[17] = _mm256_packs_epi32(u[2], u[3]);
+ out[9] = _mm256_packs_epi32(u[4], u[5]);
+ out[25] = _mm256_packs_epi32(u[6], u[7]);
+ out[7] = _mm256_packs_epi32(u[8], u[9]);
+ out[23] = _mm256_packs_epi32(u[10], u[11]);
+ out[15] = _mm256_packs_epi32(u[12], u[13]);
+ out[31] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m256i k32_p27_p05 =
+ pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 =
+ pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 =
+ pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 =
+ pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 =
+ pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 =
+ pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 =
+ pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 =
+ pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[5] = _mm256_packs_epi32(u[0], u[1]);
+ out[21] = _mm256_packs_epi32(u[2], u[3]);
+ out[13] = _mm256_packs_epi32(u[4], u[5]);
+ out[29] = _mm256_packs_epi32(u[6], u[7]);
+ out[3] = _mm256_packs_epi32(u[8], u[9]);
+ out[19] = _mm256_packs_epi32(u[10], u[11]);
+ out[11] = _mm256_packs_epi32(u[12], u[13]);
+ out[27] = _mm256_packs_epi32(u[14], u[15]);
+ }
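+ // Note: stage 8 has now produced every odd-numbered coefficient
+ // (out[1], out[3], ..., out[31]); the even-numbered coefficients were
+ // written by the earlier stages.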
+ }
+#endif
+ // Transpose the results, do it as four 8x8 transposes.
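+ // Each __m256i in out[] holds 16 coefficients: the low 128-bit lane
+ // belongs to the current group of 8 columns and the high lane to the
+ // next group. The unpack16/unpack32/unpack64 stages below therefore
+ // perform an 8x8 transpose independently in each lane, and the final
+ // cast/extract stores route the low lane to output_currStep and the
+ // high lane to output_nextStep.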
+ {
+ int transpose_block;
+ int16_t *output_currStep, *output_nextStep;
+ if (0 == pass) {
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ } else {
+ output_currStep = &output_org[column_start * 32];
+ output_nextStep = &output_org[(column_start + 8) * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m256i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+ // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+ // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+ // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+ // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+ // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+ // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+ // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+ // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+ // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+ // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+ // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+ // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+ // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+ // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+ // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+ // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+ // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+ // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+ // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+ // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+ __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+ // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+ // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+ // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+ // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+ // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+ // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+ // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+ __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+ __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+ __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+ __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+ __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+ __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+ __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ }
+ }
+ }
+} // NOLINT
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
new file mode 100644
index 0000000000..bf350b6da0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3130 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also cross-references static functions in
+// the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+#if FDCT32x32_HIGH_PRECISION
+static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
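+// Note: the (x + 1 + (x < 0)) >> 2 rounding above is symmetric about zero;
+// for example x = 6 gives (6 + 1 + 0) >> 2 == 1 while x = -6 gives
+// (-6 + 1 + 1) >> 2 == -1.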
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
+#else
+static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate,
+ tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
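+// Note: in the DCT_HIGH_BIT_DEPTH build ADD_EPI16/SUB_EPI16 map to the
+// saturating _mm_adds_epi16/_mm_subs_epi16 so that 16-bit overflow can be
+// detected with the check_epi16_overflow_x*() helpers and the affected pass
+// redone with the C fallbacks named above; otherwise the plain wrapping
+// _mm_add_epi16/_mm_sub_epi16 are used.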
+
+void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
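+ // Pass 0 transforms the columns of the input and stores the (transposed)
+ // result here; pass 1 then transforms the columns of this buffer, so the
+ // two passes together form the full 2-D 32x32 forward DCT.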
+ // Constants
+ // In one case all of the values used are the same; in every other case we
+ // need a pair of values repeated four times, which is done by constructing
+ // the 32-bit constant corresponding to that pair (see the note after these
+ // definitions).
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_setzero_si128();
+ const __m128i kOne = _mm_set1_epi16(1);
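+ // Note: the pair constants above are intended for _mm_madd_epi16(): for a
+ // register of interleaved 16-bit pairs (a, b), multiplying by
+ // pair_set_epi16(c0, c1) yields a * c0 + b * c1 in each 32-bit lane, which
+ // is then rounded with k__DCT_CONST_ROUNDING and shifted right by
+ // DCT_CONST_BITS, i.e. roughly ROUND_POWER_OF_TWO(a * c0 + b * c1,
+ // DCT_CONST_BITS) per output value.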
+
+ // Do the two transform/transpose passes
+ int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
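+ // Note: a scalar model of the block above (the next three blocks follow
+ // the same pattern for rows 4..15), for r = 0..3:
+ //   step1[r]      = (in[r * stride] + in[(31 - r) * stride]) << 2;
+ //   step1[31 - r] = (in[r * stride] - in[(31 - r) * stride]) << 2;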
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same pointer-offset approach as above would be
+ // counter-productive here, since all offsets can be calculated at
+ // compile time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
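+        // On overflow the SSE2 path is abandoned entirely: pass-1 checks
+        // fall back to the rows-only C transform, while checks reachable in
+        // either pass pick between HIGH_FDCT32x32_2D_C (full transform) and
+        // HIGH_FDCT32x32_2D_ROWS_C based on `pass`.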
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[9]);
+ step2[7] = ADD_EPI16(step1[7], step1[8]);
+ step2[8] = SUB_EPI16(step1[7], step1[8]);
+ step2[9] = SUB_EPI16(step1[6], step1[9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
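+      // A scalar sketch of the madd/round pattern here (assuming the usual
+      // libvpx constant layout, e.g. k__cospi_p16_m16 =
+      // pair(cospi_16_64, -cospi_16_64)):
+      //   step2[20] = dct_const_round_shift((step1[27] - step1[20]) * cospi_16_64);
+      // where dct_const_round_shift(v) is
+      //   (v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS,
+      // i.e. the k__DCT_CONST_ROUNDING add plus the _mm_srai_epi32 below.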
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+    // Scale the intermediate values down (a rounded right-shift by 2) so
+    // they stay within the range of 16 bits.
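+    // Scalar sketch of the rounding below, matching the C path's
+    // (x + 1 + (x < 0)) >> 2: _mm_cmplt_epi16 writes -1 into negative lanes,
+    // so SUB_EPI16 of that mask adds 1 to negative values before the +kOne
+    // and the arithmetic shift by 2.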
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[0] = SUB_EPI16(step2[0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
+ &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
+ &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
+ &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[0], kOne);
+ step2[1] = _mm_add_epi16(step2[1], kOne);
+ step2[2] = _mm_add_epi16(step2[2], kOne);
+ step2[3] = _mm_add_epi16(step2[3], kOne);
+ step2[4] = _mm_add_epi16(step2[4], kOne);
+ step2[5] = _mm_add_epi16(step2[5], kOne);
+ step2[6] = _mm_add_epi16(step2[6], kOne);
+ step2[7] = _mm_add_epi16(step2[7], kOne);
+ step2[8] = _mm_add_epi16(step2[8], kOne);
+ step2[9] = _mm_add_epi16(step2[9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm_srai_epi16(step2[0], 2);
+ step2[1] = _mm_srai_epi16(step2[1], 2);
+ step2[2] = _mm_srai_epi16(step2[2], 2);
+ step2[3] = _mm_srai_epi16(step2[3], 2);
+ step2[4] = _mm_srai_epi16(step2[4], 2);
+ step2[5] = _mm_srai_epi16(step2[5], 2);
+ step2[6] = _mm_srai_epi16(step2[6], 2);
+ step2[7] = _mm_srai_epi16(step2[7], 2);
+ step2[8] = _mm_srai_epi16(step2[8], 2);
+ step2[9] = _mm_srai_epi16(step2[9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
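+    // In the high-precision build, the 16-bit math below handles only the
+    // column pass (pass == 0); the row pass widens to 32-bit operations in
+    // the else branch further down.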
+ // Stage 3
+ {
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
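+      // The (8 - n) indices spell out the mirrored pairing: step3[k]
+      // combines step2[7 - k] with step2[k].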
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
+ &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
+ &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
+ &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
+ &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+
+ // Stage 4
+ {
+ step1[0] = ADD_EPI16(step3[3], step3[0]);
+ step1[1] = ADD_EPI16(step3[2], step3[1]);
+ step1[2] = SUB_EPI16(step3[1], step3[2]);
+ step1[3] = SUB_EPI16(step3[0], step3[3]);
+ step1[8] = ADD_EPI16(step3[11], step2[8]);
+ step1[9] = ADD_EPI16(step3[10], step2[9]);
+ step1[10] = SUB_EPI16(step2[9], step3[10]);
+ step1[11] = SUB_EPI16(step2[8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
+ &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
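+      // Half-butterfly sketch: with (step3[6], step3[5]) interleaved, the
+      // madd against pair(cospi_16_64, -cospi_16_64) yields
+      // (step3[6] - step3[5]) * cospi_16_64 -> step1[5], and the one
+      // against pair(cospi_16_64, cospi_16_64) yields the sum -> step1[6].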
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 5
+ {
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
+ &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 =
+ _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 =
+ _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 =
+ _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 =
+ _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 =
+ _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 =
+ _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 =
+ _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 =
+ _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
+ &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
+ &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
+ &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
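+      // out_12_*/out_28_* interleave the same operands as
+      // out_20_*/out_04_*; only the cosine pairs applied below differ.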
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 =
+ _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 =
+ _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 =
+ _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 =
+ _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 =
+ _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 =
+ _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 =
+ _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 =
+ _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[8] = ADD_EPI16(step2[9], step1[8]);
+ step3[9] = SUB_EPI16(step1[8], step2[9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 =
+ _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 =
+ _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 =
+ _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 =
+ _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 =
+ _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 =
+ _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 =
+ _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 =
+ _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 =
+ _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 =
+ _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 =
+ _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 =
+ _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 =
+ _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 =
+ _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 =
+ _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 =
+ _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
+ &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
+ &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+    // Final stage --- output indices are bit-reversed.
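+    // e.g. out[1] is built from step1[16]/step1[31], out[17] from
+    // step1[17]/step1[30], and so on; the write indices 1, 17, 9, 25, ...
+    // follow the butterfly's bit-reversed addressing.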
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 =
+ _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 =
+ _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 =
+ _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 =
+ _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 =
+ _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 =
+ _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 =
+ _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 =
+ _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 =
+ _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 =
+ _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 =
+ _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 =
+ _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 =
+ _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 =
+ _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 =
+ _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 =
+ _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 =
+ _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 =
+ _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 =
+ _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 =
+ _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 =
+ _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 =
+ _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 =
+ _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 =
+ _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 =
+ _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 =
+ _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 =
+ _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 =
+ _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 =
+ _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 =
+ _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 =
+ _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 =
+ _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m128i lstep1[64], lstep2[64], lstep3[64];
+ __m128i u[32], v[32], sign[16];
+ const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+ const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
+ // start using 32-bit operations
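+      // The remaining stages run at 32-bit precision: 16-bit rows are
+      // widened on the fly and every butterfly below works on vectors of
+      // four 32-bit values, trading throughput for accuracy.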
+ // stage 3
+ {
+          // Expand to 32 bits while adding and subtracting: the unpacks
+          // interleave two 16-bit rows so that _mm_madd_epi16 against
+          // kOne = (1, 1) yields the widened sums and against
+          // k__pOne_mOne = (1, -1) the widened differences.
+ lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
+ lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
+ lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
+ lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
+ lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
+ lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
+ lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
+ lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
+
+ lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
+ lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
+ lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
+ lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
+ lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
+ lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
+ lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
+ lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
+
+ lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
+ lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
+ lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
+ lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
+ lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
+ lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
+ lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
+ lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
+
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
+
+ lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
+
+ lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
+ lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
+ lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
+ lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
+ lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
+ lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
+ lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
+ lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
+
+ lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
+ lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
+ lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
+ lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
+ lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
+ lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
+ lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
+ lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
+
+ lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
+ }
+
+ // stage 4
+ {
+          // Sign-extend to 32 bits before the additions: interleave each
+          // 16-bit row with its sign mask from _mm_cmpgt_epi16(kZero, x).
+ sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
+ sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
+ sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
+ sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
+ lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
+ lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
+ lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
+ lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
+
+ lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+          // 32-bit cospi_16_64 butterfly producing lstep1[10..13], the
+          // stage 4 counterpart of the 16-bit path above.
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+          // TODO(jingning): manually inline k_madd_epi32 to further hide
+          // instruction latency.
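+          // Each 64-bit half of u[] holds an (even, odd) pair from the
+          // epi32 unpacks above; k_madd_epi32 multiplies the pair by the
+          // matching (c0, c1) constants and sums into one 64-bit product,
+          // which k_packs_epi64 later narrows back to 32-bit lanes.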
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
+ &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
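+      // stage 5 finishes the low-frequency group: coefficients 0, 16, 8 and
+      // 24 fall out of the cospi_16_64/cospi_24_64 butterflies below.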
+ {
+ lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32 to further hide
+          // instruction latency.
+ v[0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
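+          // The three steps above implement (x + 1 + (x < 0)) >> 2 per lane:
+          // subtracting the comparison mask adds one to negative lanes,
+          // K32One adds the rounding bias, and the arithmetic shift applies
+          // this pass's final scaling by 1/4.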
+ // Combine
+ out[0] = _mm_packs_epi32(u[0], u[1]);
+ out[16] = _mm_packs_epi32(u[2], u[3]);
+ out[8] = _mm_packs_epi32(u[4], u[5]);
+ out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
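+      // stage 6 yields coefficients 4, 20, 12 and 28, then the remaining
+      // butterflies update lstep3[16..31] and the odd-half terms in
+      // lstep3[34..61] for stage 7.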
+ {
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ out[4] = _mm_packs_epi32(u[0], u[1]);
+ out[20] = _mm_packs_epi32(u[2], u[3]);
+ out[12] = _mm_packs_epi32(u[4], u[5]);
+ out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m12_m20 =
+ pair_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
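+      // stage 7 emits coefficients 2, 18, 10, 26, 6, 22, 14 and 30, and
+      // assembles lstep1[32..63] for the final odd coefficients of stage 8.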
+ {
+ const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+ const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+ const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[2] = _mm_packs_epi32(u[0], u[1]);
+ out[18] = _mm_packs_epi32(u[2], u[3]);
+ out[10] = _mm_packs_epi32(u[4], u[5]);
+ out[26] = _mm_packs_epi32(u[6], u[7]);
+ out[6] = _mm_packs_epi32(u[8], u[9]);
+ out[22] = _mm_packs_epi32(u[10], u[11]);
+ out[14] = _mm_packs_epi32(u[12], u[13]);
+ out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
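+      // stage 8: the last butterflies. This block emits coefficients 1, 17,
+      // 9, 25, 7, 23, 15 and 31; mirroring the 16-bit path above, the
+      // following cospi_27/5, 11/21, 19/13 and 3/29 block should emit 5, 21,
+      // 13, 29, 3, 19, 11 and 27.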
+ {
+ const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+ const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+ const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+ const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+ const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[1] = _mm_packs_epi32(u[0], u[1]);
+ out[17] = _mm_packs_epi32(u[2], u[3]);
+ out[9] = _mm_packs_epi32(u[4], u[5]);
+ out[25] = _mm_packs_epi32(u[6], u[7]);
+ out[7] = _mm_packs_epi32(u[8], u[9]);
+ out[23] = _mm_packs_epi32(u[10], u[11]);
+ out[15] = _mm_packs_epi32(u[12], u[13]);
+ out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+ const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+ const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+ const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+ const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[5] = _mm_packs_epi32(u[0], u[1]);
+ out[21] = _mm_packs_epi32(u[2], u[3]);
+ out[13] = _mm_packs_epi32(u[4], u[5]);
+ out[29] = _mm_packs_epi32(u[6], u[7]);
+ out[3] = _mm_packs_epi32(u[8], u[9]);
+ out[19] = _mm_packs_epi32(u[10], u[11]);
+ out[11] = _mm_packs_epi32(u[12], u[13]);
+ out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+#endif // FDCT32x32_HIGH_PRECISION
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
+ }
+ }
+ }
+ }
+} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
new file mode 100644
index 0000000000..c8f54a49cb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/txfm_common.h"
+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size, int pass) {
+ int i;
+ const __m256i kOne = _mm256_set1_epi16(1);
+ if (pass == 0) {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+ // x = x << 2
+ out[i] = _mm256_slli_epi16(out[i], 2);
+ }
+ } else {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+ // x = (x + 1) >> 2
+ out[i] = _mm256_add_epi16(out[i], kOne);
+ out[i] = _mm256_srai_epi16(out[i], 2);
+ }
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + idx] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + idx] = _mm256_inserti128_si256( \
+ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store 16 16-bit values per row.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ tran_low_t *out,
+ const int stride,
+ const int out_size) {
+ int i;
+ for (i = 0; i < out_size; ++i) {
+ _mm256_storeu_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE __m256i mult256_round_shift(const __m256i *pin0,
+ const __m256i *pin1,
+ const __m256i *pmultiplier,
+ const __m256i *prounding,
+ const int shift) {
+ const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
+ const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
+ const __m256i v0 = _mm256_add_epi32(u0, *prounding);
+ const __m256i v1 = _mm256_add_epi32(u1, *prounding);
+ const __m256i w0 = _mm256_srai_epi32(v0, shift);
+ const __m256i w1 = _mm256_srai_epi32(v1, shift);
+ return _mm256_packs_epi32(w0, w1);
+}
+
+static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
+ int i;
+ __m256i step2[4];
+ __m256i in[8];
+ __m256i step1[8];
+ __m256i step3[8];
+
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+ const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
+ const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+
+ // Calculate input for the first 8 results.
+ for (i = 0; i < 8; i++) {
+ in[i] = ADD256_EPI16(input[i], input[15 - i]);
+ }
+
+ // Calculate input for the next 8 results.
+ for (i = 0; i < 8; i++) {
+ step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
+ }
+
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m256i q0 = ADD256_EPI16(in[0], in[7]);
+ const __m256i q1 = ADD256_EPI16(in[1], in[6]);
+ const __m256i q2 = ADD256_EPI16(in[2], in[5]);
+ const __m256i q3 = ADD256_EPI16(in[3], in[4]);
+ const __m256i q4 = SUB256_EPI16(in[3], in[4]);
+ const __m256i q5 = SUB256_EPI16(in[2], in[5]);
+ const __m256i q6 = SUB256_EPI16(in[1], in[6]);
+ const __m256i q7 = SUB256_EPI16(in[0], in[7]);
+
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m256i r0 = ADD256_EPI16(q0, q3);
+ const __m256i r1 = ADD256_EPI16(q1, q2);
+ const __m256i r2 = SUB256_EPI16(q1, q2);
+ const __m256i r3 = SUB256_EPI16(q0, q3);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
+ const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
+ const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
+ const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
+
+ output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[12] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
+ const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
+ const __m256i r0 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m256i r1 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+
+ {
+ // Add/subtract
+ const __m256i x0 = ADD256_EPI16(q4, r0);
+ const __m256i x1 = SUB256_EPI16(q4, r0);
+ const __m256i x2 = SUB256_EPI16(q7, r1);
+ const __m256i x3 = ADD256_EPI16(q7, r1);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
+ const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
+ const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
+ const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
+ output[2] =
+ mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[14] =
+ mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[10] =
+ mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[6] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ { // step 2
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 3
+ {
+ step3[0] = ADD256_EPI16(step1[0], step2[1]);
+ step3[1] = ADD256_EPI16(step1[1], step2[0]);
+ step3[2] = SUB256_EPI16(step1[1], step2[0]);
+ step3[3] = SUB256_EPI16(step1[0], step2[1]);
+ step3[4] = SUB256_EPI16(step1[7], step2[3]);
+ step3[5] = SUB256_EPI16(step1[6], step2[2]);
+ step3[6] = ADD256_EPI16(step1[6], step2[2]);
+ step3[7] = ADD256_EPI16(step1[7], step2[3]);
+ }
+ // step 4
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 5
+ {
+ step1[0] = ADD256_EPI16(step3[0], step2[0]);
+ step1[1] = SUB256_EPI16(step3[0], step2[0]);
+ step1[2] = ADD256_EPI16(step3[3], step2[1]);
+ step1[3] = SUB256_EPI16(step3[3], step2[1]);
+ step1[4] = SUB256_EPI16(step3[4], step2[3]);
+ step1[5] = ADD256_EPI16(step3[4], step2[3]);
+ step1[6] = SUB256_EPI16(step3[7], step2[2]);
+ step1[7] = ADD256_EPI16(step3[7], step2[2]);
+ }
+ // step 6
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
+ output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
+ output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+}
+
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ DECLARE_ALIGNED(32, int16_t, intermediate[256]);
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ const int width = 16;
+ const int height = 16;
+ __m256i buf0[16], buf1[16];
+
+ // Two transform and transpose passes
+ // Process 16 columns (transposed rows in second pass) at a time.
+ for (pass = 0; pass < 2; ++pass) {
+ // Load and pre-condition input.
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
+
+ // Calculate dct for 16x16 values
+ fdct16x16_1D_avx2(buf1, buf0);
+
+ // Transpose the results.
+ transpose_16bit_16x16_avx2(buf0, buf1);
+
+ if (pass == 0) {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height);
+ } else {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
+ }
+ // Setup in/out for next pass.
+ input = intermediate;
+ }
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..d546f02a14
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+ // For the purposes of the comments, the 16 inputs are referred to as i0
+ // through iF (in raster order), intermediate variables are a0, b0, c0
+ // through f, and correspond to the in-place computations mapped to input
+ // locations. The outputs, o0 through oF, are labeled according to the
+ // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+// in0 = [i0 i1 i2 i3 iC iD iE iF]
+// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+ // Check inputs are small enough to use the optimised code
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00)));
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero; all
+ // other comparisons will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+ // There are 4 total stages, alternating between an add/subtract stage
+ // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+// t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+ // but this also folds in the final right-shift to save operations.
+ // This unusual rounding operation maintains bit-accurate
+ // compatibility with the C version of this function, which has two
+ // rounding steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+ // Post-condition: the (v + 1) >> 2 step is already folded into the
+ // previous add and right-shift. Only 2 store instructions are needed
+ // because rows 1/3 are stored just after rows 0/2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case they are all the same. In all others
+ // it's a pair of them that we need to repeat four times, which is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. Because the
+ // first pass results are transposed, this means transforming the columns
+ // of the intermediate buffer (that is, the original rows) and transposing
+ // the results again so they land back in normal/row positions.
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+ // When we use them, in one case they are all the same. In all others
+ // it's a pair of them that we need to repeat four times, which is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
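+      // Advance to the next eight source columns for this pass.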
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m128i r1 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
+ &step3_4, &step3_5, &step3_6, &step3_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
+ &res06, &res07, pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
+ &res14, &res15, pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8 * 16;
+ } else {
+ out1 += 8 * 16;
+ }
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..e14b99197f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
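+// Each *_1 function computes only output[0], the DC term: it sums every
+// input sample with vector adds and applies that transform's DC scaling
+// (<< 1 for 4x4, none for 8x8, >> 1 for 16x16, >> 3 for 32x32).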
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
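+  // Widen the 16-bit sums to 32 bits: interleaving with zero puts each
+  // value in the high half of a dword, and the arithmetic shift by 16
+  // sign-extends it back down.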
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
+}
+
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ input += 8 * stride;
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D vpx_fdct4x4_sse2
+#define FDCT8x8_2D vpx_fdct8x8_sse2
+#define FDCT16x16_2D vpx_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..5aa2779706
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
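+// Multiply-accumulate adjacent 32-bit lanes into 64-bit sums:
+// _mm_mul_epu32 covers lanes 0 and 2, the shifted copies cover lanes 1 and
+// 3, and the two partial products are added (a pmaddwd analogue for dwords).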
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *zero) {
+ __m128i minus_one = _mm_set1_epi32(-1);
+ // Check for overflows
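+  // A 64-bit lane that fits in 32 signed bits has, after the shift left by
+  // one, a top dword of all zeros (non-negative) or all ones (negative);
+  // anything else is flagged as overflow below.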
+ __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+ __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+ __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+ __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+ __m128i reg0_top_dwords =
+ _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords =
+ _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords =
+ _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords =
+ _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+ __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+  int overflow_23 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+ return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ }
+ }
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
+ preg27, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
+ preg31, zero);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return overflow;
+}
+
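+// With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits wide, so the eight
+// 16-bit results are sign-extended into two 4x32 stores; otherwise a single
+// 8x16 store suffices.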
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
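+// One butterfly rotation: pin0/pin1 hold interleaved sample pairs and
+// pmultiplier the matching constant pair, so each madd yields
+// a * c0 + b * c1, which is then rounded, shifted and packed back to 16
+// bits.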
+static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
+ const __m128i *pmultiplier,
+ const __m128i *prounding,
+ const int shift) {
+ const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+ const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+ const __m128i v0 = _mm_add_epi32(u0, *prounding);
+ const __m128i v1 = _mm_add_epi32(u1, *prounding);
+ const __m128i w0 = _mm_srai_epi32(v0, shift);
+ const __m128i w1 = _mm_srai_epi32(v1, shift);
+ return _mm_packs_epi32(w0, w1);
+}
+
+static INLINE void transpose_and_output8x8(
+ const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
+ const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07, const int pass,
+ int16_t *out0_ptr, tran_low_t *out1_ptr) {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
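+  // Pass 0 stores the transposed block into the 16-bit intermediate buffer
+  // (row pitch 16); pass 1 goes through storeu_output so the coefficients
+  // are widened to tran_low_t where necessary.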
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
+ } else {
+ storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+ storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+ storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+ storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+ storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+ storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+ storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+ storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..2c338fb5dd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,361 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
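+; 23170 = 2 * round(16384 * cos(pi / 4)); doubling the constant lets
+; pmulhrsw, which computes round(a * b / 32768), match the 14-bit rounded
+; scaling used by the pmaddwd paths. 8192 = 1 << 13 is that rounding bias.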
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
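+
+; Each macro emits a (c1, c2) pair plus its (c2, -c1) partner for one plane
+; rotation: 16384 * cos(pi / 4) = 11585, (cos, sin)(pi / 8) = (15137, 6270),
+; (cos, sin)(pi / 16) = (16069, 3196),
+; (cos, sin)(3 * pi / 16) = (13623, 9102).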
+
+SECTION .text
+
+%if VPX_ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ;stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
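+  ; Final scaling: subtracting the sign mask (x >> 15) adds one to negative
+  ; values, so the psraw by 1 below divides by two rounding toward zero,
+  ; matching the C reference's (x + (x < 0)) >> 1.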
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ mova [outputq + 0], m4
+ mova [outputq + 16], m5
+ mova [outputq + 32], m3
+ mova [outputq + 48], m0
+ mova [outputq + 64], m10
+ mova [outputq + 80], m9
+ mova [outputq + 96], m7
+ mova [outputq + 112], m2
+
+ RET
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..01a52ec8bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1495 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ assert(w % 4 == 0);
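+  // w is one of 4, 8, 16, 32 or 64; each branch copies rows with the widest
+  // loads that fit (256-bit for w >= 16, 128-bit for w == 8, 64-bit for
+  // w == 4).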
+ if (w > 32) { // w = 64
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+ _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // w = 32
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // w = 16
+ __m256i p0, p1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+ p1 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w > 4) { // w = 8
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storeu_si128((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storeu_si128((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // w = 4
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ assert(w % 4 == 0);
+ if (w > 32) { // w = 64
+ __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+ u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+ _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // w = 32
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // w = 16
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+ _mm256_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else if (w > 4) { // w = 8
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadu_si128((const __m128i *)dst);
+ u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+ _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else { // w = 4
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadl_epi64((const __m128i *)dst);
+ u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
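+
+// The shuffle tables above pair adjacent 16-bit pixels so that one madd per
+// tap pair accumulates
+// filter[k] * src[i + k] + filter[k + 1] * src[i + k + 1] for several
+// outputs at once.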
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by the 8x2 and 16x1 block paths
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
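+
+// After pack_filters(), f[k] holds the tap pair (filter[2 * k],
+// filter[2 * k + 1]) broadcast across all lanes, ready to madd against the
+// packed pixel pairs above.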
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
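+  // Adding the two middle products via min then max sums the smaller one
+  // first, presumably to keep the intermediate 32-bit sum farther from
+  // saturation.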
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(). The
+// difference is that s0/s1 hold either the first and second rows, or the
+// first 16 samples and the 16 samples starting 8 samples later.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// Shared by the 8-wide and 16-wide 2-tap horizontal filters and by
+// filter_16x2_2t_pixels().
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
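+// The 8-tap vertical filter keeps a sliding window of interleaved row pairs:
+// sig[0..3] hold the first four columns and sig[4..7] the last four columns
+// of the 8-wide block, while sig[8] caches the most recently loaded row so
+// each iteration only needs to load the next two rows. update_pixels() then
+// slides the window down by one row pair.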
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void vpx_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
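+// 16-wide version of the sliding window above: sig[0..7] cover the first
+// eight columns, sig[8..15] the last eight, and sig[16] caches the most
+// recently loaded row.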
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+ // load rows 0 to 6
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
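+// The 2-tap vertical filter only needs one row of history: sig[2] caches the
+// last loaded row, and each iteration interleaves it with the next row into
+// sig[0]/sig[1] before filtering.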
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+// Calculation with averaging against the existing destination pixels
+
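+// These store helpers mirror the ones above but average the clamped filter
+// output with the pixels already in dst (rounding average via
+// _mm_avg_epu16/_mm256_avg_epu16); they are used by the *_avg filter paths.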
+static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y0);
+ const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
+ const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
+ const __m256i pix =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
+ const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ p = _mm256_avg_epu16(p, pix0);
+ _mm256_storeu_si256((__m256i *)dst, p);
+
+ p = _mm256_min_epi16(*y1, *mask);
+ p = _mm256_avg_epu16(p, pix1);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
+ const __m128i *y1,
+ const __m128i *mask,
+ uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, *mask);
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We extract the middle four elements of the kernel into two registers in
+ // the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum; adding the two halves
+ // gives us the output. Since AVX2 provides 256-bit registers, we can do this
+ // two rows at a time.
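+ // For example, after src_ptr is moved back by one sample, idx_shift_0
+ // selects the words s[-1] s[0] s[0] s[1] s[1] s[2] s[2] s[3] in each lane
+ // and idx_shift_2 the same sequence starting at s[1], so one madd per
+ // register applies taps k[2]/k[3] and k[4]/k[5] respectively.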
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i res_reg;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+ 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+ 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the result
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the result
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+ }
+}
+
+static void vpx_highbd_filter_block1d8_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will extract the middle four elements of the kernel into two registers
+ // in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Each multiply and add gives us half of the sum; adding the two gives us
+ // the first half of the output. Repeat on the source shifted by four samples
+ // to get the second half. Since AVX2 provides 256-bit registers, we can do
+ // this two rows at a time.
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i res_reg, res_first, res_last;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+ 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+ 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Result for first half
+ res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Result for second half
+ res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round each result
+ res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+ res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_first, res_last);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d8_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels and rearrange them into the form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get the partial
+ // sums for taps k[2]/k[3]. Adding the partial sums for taps k[4]/k[5],
+ // computed from the next two rows, gives the output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223;
+
+ // Result after multiply and add
+ __m256i res_reg;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used
+
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+
+ // Output
+ res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the words
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ // Save the result
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d8_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels and rearrange them into the form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get the partial
+ // sums for taps k[2]/k[3]. Adding the partial sums for taps k[4]/k[5],
+ // computed from the next two rows, gives the output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel
+
+ // Result after multiply and add
+ __m256i res_reg, res_reg_lo, res_reg_hi;
+
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+ src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+ src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
+
+ // Output from first half
+ res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Output from second half
+ res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the words
+ res_reg_lo =
+ mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_hi =
+ mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ // Save the result
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001_lo = src_reg_1223_lo;
+ src_reg_m1001_hi = src_reg_1223_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d16_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+
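+// There is no 4-wide AVX2 implementation of the 8-tap or 2-tap filters, so
+// reuse the SSE2 assembly versions declared above.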
+#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
+#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
+#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
+#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
+ vpx_highbd_filter_block1d16_v8_avg_avx2
+#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
+ vpx_highbd_filter_block1d16_h8_avg_avx2
+#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
+ vpx_highbd_filter_block1d8_v8_avg_avx2
+#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
+ vpx_highbd_filter_block1d8_h8_avg_avx2
+#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
+ vpx_highbd_filter_block1d4_v8_avg_avx2
+#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
+ vpx_highbd_filter_block1d4_h8_avg_avx2
+
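+// These macros (from vpx_dsp/x86/convolve.h) wrap the block filters above
+// into the public entry points, e.g. vpx_highbd_convolve8_horiz_avx2,
+// vpx_highbd_convolve8_vert_avx2 and vpx_highbd_convolve8_avx2.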
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+
+#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
+ vpx_highbd_filter_block1d4_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
+ vpx_highbd_filter_block1d4_h2_avg_sse2
+#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
+ vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
+ vpx_highbd_filter_block1d4_v2_avg_sse2
+
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
+
+#undef HIGHBD_FUNC
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
new file mode 100644
index 0000000000..f4f7235d13
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp, sign);
+ step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ highbd_idct16_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct16_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
+ highbd_load_pack_transpose_32bit_8x8(input, 16, in);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], l[16];
+
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 0000000000..7898ee12c8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ extend_64bit(io[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ extend_64bit(io[0], temp);
+ step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
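+    // At 8-bit depth the intermediate values fit in 16 bits, so the 32-bit
+    // coefficients are packed and the 16-bit 8-column idct16_8col() is
+    // reused; the else branch keeps full 32-bit precision, 4 columns per
+    // pass.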
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ vpx_highbd_idct16_4col_sse4_1(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ vpx_highbd_idct16_4col_sse4_1(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
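+    // eob <= 10: only the top 4 rows carry coefficients, so just those rows
+    // are loaded, packed to 16 bits, and run through the two-pass path.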
+ __m128i in[16], l[16];
+
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
new file mode 100644
index 0000000000..c710e89954
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
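+  // highbd_butterfly_sse2(a, b, c0, c1, &o0, &o1) computes, with DCT
+  // rounding, o0 = a * c0 - b * c1 and o1 = a * c1 + b * c0.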
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
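+  // Lanes annotated "-step2[x]" below hold the negated value of the named
+  // quantity; the stage-5 butterflies consume exactly those negated terms,
+  // so no separate negation instructions are needed.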
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20]
+ step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21]
+ step2[22] = _mm_add_epi32(step1[21], step1[22]);
+ step2[23] = _mm_add_epi32(step1[20], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[27], step1[24]);
+ step2[25] = _mm_add_epi32(step1[26], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26]
+ step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27]
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64,
+ &step1[27], &step1[20]);
+ highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64,
+ &step1[26], &step1[21]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculations into smaller functions to limit
+// register spilling to the stack in the 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
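+  // highbd_add_sub_butterfly(in, out, n): for i in [0, n/2),
+  // out[i] = in[i] + in[n - 1 - i], out[n - 1 - i] = in[i] - in[n - 1 - i].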
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
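+  // io[i] = temp[i] + temp[31 - i], io[31 - i] = temp[i] - temp[31 - i]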
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
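+    // Keep full 32-bit precision for bd > 8: eight 4-row passes over the
+    // rows, then eight 4-column passes, with 32-bit 4x4 transposes between.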
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64,
+ &step1[17], &step1[30]);
+ highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64,
+ &step1[21], &step1[26]);
+
+ highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
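+    // eob <= 135 confines the nonzero coefficients to the top-left 16x16,
+    // so the upper half of the packed row buffer is zeroed once up front.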
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10]
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13]
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step2[18] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18]
+ step2[22] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22]
+ step2[25] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25]
+ step2[29] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29]
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
+ // rows
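+    // eob <= 34 confines the nonzero coefficients to the top-left 8x8, so a
+    // single 8x32 row pass suffices before the column passes.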
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_sse2(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_sse2(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
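+// eob == 1: only the DC coefficient is nonzero, so the shared kernel adds a
+// single rounded DC term to every pixel of the 32x32 block.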
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
new file mode 100644
index 0000000000..2d0a53ac0a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
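+  // SSE4.1 adds a signed 32-bit multiply (_mm_mul_epi32), so the butterfly
+  // helper can take negative cosine constants such as -cospi_8_64 directly,
+  // where the SSE2 version tracks negated intermediates instead.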
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
+ &step2[10], &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[22], step1[21]);
+ step2[22] = _mm_add_epi32(step1[22], step1[21]);
+ step2[23] = _mm_add_epi32(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[27]);
+ step2[25] = _mm_add_epi32(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi32(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
+ &step1[20], &step1[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
+ &step1[21], &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculations into smaller functions to limit
+// register spilling to the stack in the 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_135_8x32_ssse3(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_135_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
+ // rows
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_ssse3(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
new file mode 100644
index 0000000000..b9c8884f99
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
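+// Gather the low 32 bits of the four 64-bit products held in in0/in1 into
+// one vector and apply the DCT rounding shift to all four values at once.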
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+ const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3
+ const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3
+ return dct_const_round_shift_sse2(t2);
+}
+
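+// 4-point IDCT for coefficients outside the +/-4096 range handled by
+// idct4_sse2() but still within int16_t; see the note below on why
+// _mm_mul_epu32() is used for the multiplies.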
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
+ const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
+ const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+ const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+ __m128i temp1[4], temp2[4], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+  // Note: There is no 32-bit signed multiply SIMD instruction in SSE2.
+  // _mm_mul_epu32() is used instead; only the lower 32 bits of each
+  // (signed) result are guaranteed to be meaningful, which is enough in
+  // this function.
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3
+ temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3
+ temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ temp1[3] = _mm_srli_si128(io[1], 4);
+ temp2[3] = _mm_srli_si128(io[3], 4);
+ temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64
+ temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64
+ temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64
+ temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64
+ temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
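+// 4-point IDCT for the full coefficient range, using the shared sign-aware
+// highbd butterfly helpers instead of the unsigned-multiply shortcut above.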
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
+ __m128i step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
+ highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
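+// Dispatch on the coefficient range: bd == 8 or small input runs the 16-bit
+// idct4_sse2() twice; otherwise one of the 32-bit paths above is run twice.
+// Both branches finish with rounding and reconstruction into dest.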
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int16_t max = 0, min = 0;
+ __m128i io[4], io_short[2];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+
+ if (bd != 8) {
+ __m128i max_input, min_input;
+
+ max_input = _mm_max_epi16(io_short[0], io_short[1]);
+ min_input = _mm_min_epi16(io_short[0], io_short[1]);
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+ max = (int16_t)_mm_extract_epi16(max_input, 0);
+ min = (int16_t)_mm_extract_epi16(min_input, 0);
+ }
+
+ if (bd == 8 || (max < 4096 && min >= -4096)) {
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ if (max < 32767 && min > -32768) {
+ highbd_idct4_small_sse2(io);
+ highbd_idct4_small_sse2(io);
+ } else {
+ highbd_idct4_large_sse2(io);
+ highbd_idct4_large_sse2(io);
+ }
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4x4(io, dest, stride, bd);
+}
+
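+// DC-only path: compute the single rounded DC value, broadcast it and
+// add-with-clamp to each of the four rows.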
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int a1, i;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < 4; ++i) {
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, dc, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+ dest += stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
new file mode 100644
index 0000000000..fe74d272ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[4];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ if (bd == 8) {
+ __m128i io_short[2];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ highbd_idct4_sse4_1(io);
+ highbd_idct4_sse4_1(io);
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
new file mode 100644
index 0000000000..bb7a510e15
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
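+// One 1-D pass of the 8-point high-bitdepth IDCT over a 4x8 half of the
+// block, operating on 32-bit coefficients.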
+static void highbd_idct8x8_half1d(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
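+// As above, but for the eob <= 12 case where only the top-left 4x4
+// coefficients are nonzero, so the odd inputs reduce to single multiplies.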
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[4], sign[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ abs_extend_64bit_sse2(io[1], temp1, sign);
+ step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64);
+ abs_extend_64bit_sse2(io[3], temp1, sign);
+ step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64);
+ step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64);
+
+ // stage 2
+ abs_extend_64bit_sse2(step1[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ abs_extend_64bit_sse2(step1[1], temp1, sign);
+ step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
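+// Full 8x8 IDCT/add: bd == 8 reuses the 16-bit vpx_idct8_sse2() twice;
+// otherwise each direction is done as two 32-bit half-passes with the
+// 4x8 halves swapped in between.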
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_half1d(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ highbd_idct8x8_half1d(&io[8]);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ highbd_idct8x8_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
new file mode 100644
index 0000000000..8b2e3d2415
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64,
+ &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ extend_64bit(io[1], temp1);
+ step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64);
+ extend_64bit(io[3], temp1);
+ step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64);
+ step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64);
+
+ // stage 2
+ extend_64bit(step1[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ extend_64bit(step1[1], temp1);
+ step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_ssse3(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
new file mode 100644
index 0000000000..43634aea3a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+
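+// H_PRED: each output row is the corresponding left-column sample broadcast
+// across the row; the shuffles below replicate one 16-bit lane per row.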
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
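+// Sum the four 16-bit samples with two shuffle+add steps; the total ends up
+// in the lowest lane.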
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // A 12-bit bd can overflow the 16-bit sums, so expand to 32 bits before
+  // adding the final total.
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 32; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
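+// For example, x = 1, y = 1, z = 2: avg(1, 2) = 2, minus (1 ^ 2) & 1 = 1
+// gives 1, and avg(1, 1) = 1, matching (1 + 2 * 1 + 2 + 2) >> 2 = 1. Without
+// the subtraction the rounding averages would give 2.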
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
+
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
+ const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
+ const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
+ const __m128i row0 = _mm_srli_si128(avg2, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg2, 4);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+
+ dst -= stride;
+ dst[0] = _mm_extract_epi16(avg3, 1);
+ dst[stride] = _mm_extract_epi16(avg3, 0);
+}
+
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
+ const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
+ const __m128i row0 = _mm_srli_si128(avg3, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg3, 2);
+ const __m128i row3 = avg3;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXXABC = _mm_castps_si128(
+ _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1)));
+ const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
+ const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
+ const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
+ const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
+ const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
+ const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
+ const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
+ const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row2 = _mm_srli_si128(row3, 4);
+ const __m128i row1 = _mm_srli_si128(row3, 8);
+ const __m128i row0 = _mm_srli_si128(avg3, 4);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst[0] = _mm_extract_epi16(avg2, 3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
+ const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
+ const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
+ const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
+ const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
+ const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
+ const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
+ const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row1 = _mm_srli_si128(row0, 4);
+ const __m128i row2 = _mm_srli_si128(row0, 8);
+ const __m128i row3 = LLLL0000;
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
+ const __m128i row0 = avg2;
+ const __m128i row1 = avg3;
+ const __m128i row2 = _mm_srli_si128(avg2, 2);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
new file mode 100644
index 0000000000..d673fac493
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
@@ -0,0 +1,930 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
+
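+// D45: each successive row drops the first sample of the 3-tap filtered
+// "above" row; the last pixel of the bottom row is patched with above[7].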
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+ dst[3] = above[7]; // aka H
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row, const __m128i *ar) {
+ *row = _mm_alignr_epi8(*ar, *row, 2);
+ _mm_store_si128((__m128i *)*dst, *row);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3);
+ dst += stride;
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row_0, __m128i *row_1,
+ const __m128i *ar) {
+ *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
+ *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
+ _mm_store_si128((__m128i *)*dst, *row_0);
+ _mm_store_si128((__m128i *)(*dst + 8), *row_1);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+}
+
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ int i;
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ for (i = 1; i < 32; ++i) {
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ }
+}
+
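+// pshufb mask that rotates the eight 16-bit lanes of a vector right by one
+// element; rotr_epu16() uses it to feed successive filtered left-column
+// samples into the predicted rows.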
+DECLARE_ALIGNED(16, static const uint8_t,
+ rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1 };
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
+ *a = _mm_shuffle_epi8(*a, *rotrw);
+ return *a;
+}
+
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i IXABCDEF =
+ _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
+ __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
+ __m128i rowa = avg2;
+ __m128i rowb = avg3;
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; i += 2) {
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb);
+ dst += stride;
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
+ }
+}
+
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_srli_si128(L1, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ dst += stride;
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+ const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+ const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+ const __m128i L3_ = _mm_srli_si128(L3, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowa_2 = avg2_2;
+ __m128i rowa_3 = avg2_3;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i rowb_2 = avg3_2;
+ __m128i rowb_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+ avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+ dst += stride;
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+ rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
+ const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
+ __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ __m128i rowa = avg3;
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; ++i) {
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_srli_si128(B1, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+ const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+ const __m128i C3 = _mm_srli_si128(B3, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+ const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i rowa_2 = avg3_2;
+ __m128i rowa_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+ avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
+ const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
+ const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
+ const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
+ const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
+ const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
+ const __m128i row0 =
+ _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
+ const __m128i row1 =
+ _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
+ const __m128i row2 =
+ _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
+ const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
+ const __m128i row4 =
+ _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
+ const __m128i row5 =
+ _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+ const __m128i row6 =
+ _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+ const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, row0);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row2);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row4);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row5);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row6);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row7);
+}
+
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_srli_si128(A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_srli_si128(A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i avg2_avg3_left[2][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_srli_si128(A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_srli_si128(A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+ const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+ const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+ const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i row_2 = avg3_2;
+ __m128i row_3 = avg3_3;
+ __m128i avg2_avg3_left[4][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+ avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+ for (j = 0; j < 4; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ }
+ }
+}
+
+static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ *dst += stride;
+}
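+
+// d207_store_4x8() emits four rows from the interleaved (avg2, avg3) stream,
+// advancing by one pair (4 bytes) per row, so each row down the block steps
+// one position further along the left column.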
+
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
+ (void)above;
+ (void)bd;
+ d207_store_4x8(&dst, stride, &out_a, &out_b);
+ d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
+}
+
+static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ (void)above;
+ (void)bd;
+ d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
+ d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
+ d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
+ d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
+}
+
+static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c, const __m128i *d,
+ const __m128i *e) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ _mm_store_si128((__m128i *)(*dst + 16), *c);
+ _mm_store_si128((__m128i *)(*dst + 24), *d);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
+ const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
+ const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
+ const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
+ (void)above;
+ (void)bd;
+ d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
+ d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
+ d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
+ d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
+ d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
+ d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
+}
+
+static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *a, __m128i *b, const __m128i *ar) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+}
+
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ (void)left;
+ (void)bd;
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+}
+
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 14; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+}
+
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 30; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
+ avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
+ avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
new file mode 100644
index 0000000000..caf506ac07
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -0,0 +1,453 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
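+; Note: despite the pw_ prefix, pw_16 and pw_32 are dword constants; the
+; 16x16 and 32x32 DC sums below are widened to 32 bits before rounding.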
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
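+; The TM (true motion) predictors compute, for each pixel,
+; clip(left[row] + above[col] - above[-1], 0, (1 << bd) - 1).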
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
+ movd m1, [aboveq-2]
+ movq m0, [aboveq]
+ pshuflw m1, m1, 0x0
+ movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4
+ movlhps m1, m1 ; tl tl tl tl tl tl tl tl
+ ; Get the values to compute the maximum value at this bit depth
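+  ; (pcmpeqw sets all bits; shifting left by bd and inverting leaves exactly
+  ; the low bd bits set, i.e. (1 << bd) - 1, without a memory constant.)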
+ pcmpeqw m3, m3
+ movd m4, bdd
+ psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl
+ psllw m3, m4
+ pcmpeqw m2, m2
+ pxor m4, m4 ; min possible value
+ pxor m3, m2 ; max possible value
+ mova m1, [leftq]
+ pshuflw m2, m1, 0x0
+ pshuflw m5, m1, 0x55
+ movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+ ;Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ pshuflw m2, m1, 0xaa
+ pshuflw m5, m1, 0xff
+ movlhps m2, m5
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+ ;Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
+ movd m1, [aboveq-2]
+ mova m0, [aboveq]
+ pshuflw m1, m1, 0x0
+ ; Get the values to compute the maximum value at this bit depth
+ mov oned, 1
+ pxor m3, m3
+ pxor m4, m4
+ pinsrw m3, oned, 0
+ pinsrw m4, bdd, 0
+ pshuflw m3, m3, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ punpcklqdq m3, m3
+ mov lineq, -4
+ mova m2, m3
+ punpcklqdq m1, m1
+ psllw m3, m4
+ add leftq, 16
+ psubw m3, m2 ; max possible value
+ pxor m4, m4 ; min possible value
+ psubw m0, m1
+.loop:
+ movd m1, [leftq+lineq*4]
+ movd m2, [leftq+lineq*4+2]
+ pshuflw m1, m1, 0x0
+ pshuflw m2, m2, 0x0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ paddw m1, m0
+ paddw m2, m0
+ ;Clamp to the bit-depth
+ pminsw m1, m3
+ pminsw m2, m3
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ ;Store the values
+ mova [dstq ], m1
+ mova [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
+ movd m2, [aboveq-2]
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ pshuflw m2, m2, 0x0
+ ; Get the values to compute the maximum value at this bit depth
+ pcmpeqw m3, m3
+ movd m4, bdd
+ punpcklqdq m2, m2
+ psllw m3, m4
+ pcmpeqw m5, m5
+ pxor m4, m4 ; min possible value
+ pxor m3, m5 ; max possible value
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -8
+ psubw m0, m2
+ psubw m1, m2
+.loop:
+ movd m7, [leftq]
+ pshuflw m5, m7, 0x0
+ pshuflw m2, m7, 0x55
+ punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1
+ punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1
+ paddw m5, m1 ; t5-tl+l1 to t8-tl+l1
+ pminsw m6, m3
+ pminsw m5, m3
+ pmaxsw m6, m4 ; Clamp to the bit-depth
+ pmaxsw m5, m4
+ mova [dstq ], m6
+ mova [dstq +16], m5
+ paddw m6, m2, m0
+ paddw m2, m1
+ pminsw m6, m3
+ pminsw m2, m3
+ pmaxsw m6, m4
+ pmaxsw m2, m4
+ mova [dstq+strideq*2 ], m6
+ mova [dstq+strideq*2+16], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ lea leftq, [leftq+4]
+
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
+ movd m0, [aboveq-2]
+ mova m1, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ pshuflw m0, m0, 0x0
+ ; Get the values to compute the maximum value at this bit depth
+ pcmpeqw m5, m5
+ movd m6, bdd
+ psllw m5, m6
+ pcmpeqw m7, m7
+ pxor m6, m6 ; min possible value
+ pxor m5, m7 ; max possible value
+ punpcklqdq m0, m0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ psubw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ psubw m4, m0
+.loop:
+ movd m7, [leftq]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +48], m0
+ movd m7, [leftq+2]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2 ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+48], m0
+ lea dstq, [dstq+strideq*4]
+ lea leftq, [leftq+4]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
new file mode 100644
index 0000000000..1d07391b02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// Note: there is no 64-bit arithmetic right-shift SIMD instruction in SSE2.
+// All coefficients are therefore pre-shifted left by 2 bits, so that
+// dct_const_round_shift() (a rounding right shift by DCT_CONST_BITS = 14) can
+// be done by shifting right by 2 whole bytes (16 bits).
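+//
+// Concretely, for a coefficient c and an input value a:
+//   (a * (c << 2) + (DCT_CONST_ROUNDING << 2)) >> 16
+//       == (a * c + DCT_CONST_ROUNDING) >> 14
+//       == dct_const_round_shift(a * c)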
+
+static INLINE void extend_64bit(const __m128i in,
+ __m128i *const out /*out[2]*/) {
+ out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
+ out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
+}
+
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 4);
+ temp[1] = _mm_srai_epi32(temp[1], 4);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 5);
+ temp[1] = _mm_srai_epi32(temp[1], 5);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
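+
+// The wraplow helpers implement the final rounding shift of the inverse
+// transform, ROUND_POWER_OF_TWO(x, 4) and ROUND_POWER_OF_TWO(x, 5); the
+// caller supplies the matching rounding constant (1 << 3 or 1 << 4) and the
+// two 32-bit halves are packed back to saturated 16-bit output.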
+
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+ const __m128i t =
+ _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
+ return _mm_srli_si128(t, 2);
+}
+
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
+ return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+ __m128i *const out /*out[2]*/,
+ __m128i *const sign /*sign[2]*/) {
+ sign[0] = _mm_srai_epi32(in, 31);
+ out[0] = _mm_xor_si128(in, sign[0]);
+ out[0] = _mm_sub_epi32(out[0], sign[0]);
+ sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3
+ sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1
+ out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3
+ out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1
+}
+
+// Note: cospi must be non-negative.
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+ const __m128i sign,
+ const __m128i cospi) {
+ __m128i out = _mm_mul_epu32(in, cospi);
+ out = _mm_xor_si128(out, sign);
+ return _mm_sub_epi64(out, sign);
+}
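+
+// abs_extend_64bit_sse2() splits the input into |in| and per-lane sign masks,
+// and multiply_apply_sign_sse2() restores the sign of the 64-bit product with
+// the two's-complement identity (x ^ sign) - sign (negation when sign is all
+// ones, identity when sign is zero). This works around the lack of a signed
+// 32x32->64 multiply before SSE4.1: _mm_mul_epu32 is unsigned.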
+
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_neg_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
+ t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+ const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+ __m128i temp1[4], temp2[4], sign1[2], sign2[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in0, temp1, sign1);
+ abs_extend_64bit_sse2(in1, temp2, sign2);
+ temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
+ temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
+ temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
+ temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
+ temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
+ temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
+ temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
+ temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
+ const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_round_shift_sse2(temp, sign, c0);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c0);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2, sign[2];
+
+ temp2 = _mm_add_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+}
+
+// Performs only the addition/subtraction butterfly; size must be 16 or 32.
+static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi32(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+static INLINE void highbd_idct8_stage4(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+}
+
+static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
+ io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+ io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+ io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+ io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+ io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+ io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+ io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+ io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+ const int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ // Faster than _mm_set1_epi16((1 << bd) - 1).
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i d;
+
+ d = _mm_adds_epi16(in0, in1);
+ d = _mm_max_epi16(d, zero);
+ d = _mm_min_epi16(d, max);
+
+ return d;
+}
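+
+// add_clamp() saturating-adds its two inputs and clamps the result to the
+// valid pixel range [0, (1 << bd) - 1].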
+
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd,
+ const int size) {
+ int a1, i, j;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < size; ++i) {
+ for (j = 0; j < size; j += 8) {
+ d = _mm_load_si128((const __m128i *)(&dest[j]));
+ d = add_clamp(d, dc, bd);
+ _mm_store_si128((__m128i *)(&dest[j]), d);
+ }
+ dest += stride;
+ }
+}
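+
+// highbd_idct_1_add_kernel() handles the DC-only case: the single input
+// coefficient goes through the two cospi_16_64 rounding stages, is reduced to
+// the per-pixel offset a1, and is then added with clamping to every pixel of
+// the size x size block.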
+
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ d = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
+}
+
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
+ dest += 2 * stride;
+ recon_and_store_4x2(in[1], dest, stride, bd);
+}
+
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_load_si128((const __m128i *)(*dest));
+ d = add_clamp(d, in, bd);
+ _mm_store_si128((__m128i *)(*dest), d);
+ *dest += stride;
+}
+
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_pack_8_32bit(input + 0 * stride);
+ in[1] = load_pack_8_32bit(input + 1 * stride);
+ in[2] = load_pack_8_32bit(input + 2 * stride);
+ in[3] = load_pack_8_32bit(input + 3 * stride);
+ in[4] = load_pack_8_32bit(input + 4 * stride);
+ in[5] = load_pack_8_32bit(input + 5 * stride);
+ in[6] = load_pack_8_32bit(input + 6 * stride);
+ in[7] = load_pack_8_32bit(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
+ in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
+ in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
+ in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
+ transpose_32bit_8x4(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ transpose_32bit_4x4(in, in);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
+}
+
+#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
new file mode 100644
index 0000000000..f446bb13f3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+
+static INLINE __m128i multiplication_round_shift_sse4_1(
+ const __m128i *const in /*in[2]*/, const int c) {
+ const __m128i pair_c = pair_set_epi32(c * 4, 0);
+ __m128i t0, t1;
+
+ t0 = _mm_mul_epi32(in[0], pair_c);
+ t1 = _mm_mul_epi32(in[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
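+
+// With SSE4.1 the signed 32x32->64 multiply (_mm_mul_epi32) is available, so
+// the abs/sign handling required by the SSE2 versions of these helpers is
+// unnecessary here.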
+
+static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i temp1[4], temp2[4];
+
+ extend_64bit(in0, temp1);
+ extend_64bit(in1, temp2);
+ temp1[2] = _mm_mul_epi32(temp1[0], pair_c1);
+ temp1[3] = _mm_mul_epi32(temp1[1], pair_c1);
+ temp1[0] = _mm_mul_epi32(temp1[0], pair_c0);
+ temp1[1] = _mm_mul_epi32(temp1[1], pair_c0);
+ temp2[2] = _mm_mul_epi32(temp2[0], pair_c0);
+ temp2[3] = _mm_mul_epi32(temp2[1], pair_c0);
+ temp2[0] = _mm_mul_epi32(temp2[0], pair_c1);
+ temp2[1] = _mm_mul_epi32(temp2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2;
+
+ temp2 = _mm_add_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+}
+
+static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2];
+
+ extend_64bit(in, temp);
+ *out0 = multiplication_round_shift_sse4_1(temp, c0);
+ *out1 = multiplication_round_shift_sse4_1(temp, c1);
+}
+
+static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
+ __m128i temp[2], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ extend_64bit(temp[0], temp);
+ step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ extend_64bit(temp[0], temp);
+ step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
+
+#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..9f45623dee
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i t80, max, min;
+
+ if (bd == 8) {
+ t80 = _mm_set1_epi16(0x80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ } else if (bd == 10) {
+ t80 = _mm_set1_epi16(0x200);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ } else { // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ }
+
+ min = _mm_subs_epi16(zero, t80);
+
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
+}
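
A minimal scalar sketch of the same clamp, assuming t80 = 0x80 << (bd - 8) as in the vector code (the helper name is hypothetical, shown only for illustration):

    static int16_t clamp_bd_scalar(int32_t value, int bd) {
      const int32_t t80 = 0x80 << (bd - 8);       // 0x80, 0x200 or 0x800
      const int32_t max = ((1 << bd) - 1) - t80;  // e.g. 127 for bd == 8
      const int32_t min = -t80;                   // e.g. -128 for bd == 8
      if (value > max) return (int16_t)max;
      if (value < min) return (int16_t)min;
      return (int16_t)value;
    }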
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+ __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+ __m128i ps1, qs1, ps0, qs0;
+ __m128i abs_p0q0, abs_p1q1, ffff, work;
+ __m128i filt, work_a, filter1, filter2;
+ __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+ __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+ __m128i flat2_q0, flat2_p0;
+ __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+ __m128i t4, t3, t80, t1;
+ __m128i eight, four;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ }
+
+ q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+ p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+ q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+ p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+
+ // highbd_filter_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+ // highbd_hev_mask (in C code this is actually called from highbd_filter4)
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
+
+ // lp filter
+ // highbd_filter4
+ t4 = _mm_set1_epi16(4);
+ t3 = _mm_set1_epi16(3);
+ if (bd == 8)
+ t80 = _mm_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+
+ t1 = _mm_set1_epi16(0x1);
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+
+ filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
+ hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3, Filter2 >> 3
+ filter1 = _mm_srai_epi16(filter1, 0x3);
+ filter2 = _mm_srai_epi16(filter2, 0x3);
+
+ qs0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ ps0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(hev, filt);
+ qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ // end highbd_filter4
+ // loopfilter done
+
+ // highbd_flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ flat = _mm_max_epi16(work, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ // end flat_mask4
+
+ // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each block of 16 bits is either all 1s or all 0s)
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+ q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+ p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+ q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+ p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+ q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
+
+ // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+ // but referred to as p0-p4 & q0-q4 in fn)
+ flat2 = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
+ _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
+ _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
+ _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
+ _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ if (bd == 8)
+ flat2 = _mm_subs_epu16(flat2, one);
+ else if (bd == 10)
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+ flat2 = _mm_cmpeq_epi16(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ // end highbd_flat_mask5
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ eight = _mm_set1_epi16(8);
+ four = _mm_set1_epi16(4);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ flat2_p0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
+ flat2_q0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
+ flat_p0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
+ flat_q0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
+
+ sum_p7 = _mm_add_epi16(p7, p7);
+ sum_q7 = _mm_add_epi16(q7, q7);
+ sum_p3 = _mm_add_epi16(p3, p3);
+ sum_q3 = _mm_add_epi16(q3, q3);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+ flat2_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+ flat2_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+ flat_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
+ flat_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ sum_p3 = _mm_add_epi16(sum_p3, p3);
+ sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+ flat2_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
+ flat2_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+ flat_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
+ flat_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+ flat2_p3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
+ flat2_q3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+ flat2_p4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
+ flat2_q4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+ flat2_p5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
+ flat2_q5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+ flat2_p6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
+ flat2_q6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // highbd_filter8
+ p2 = _mm_andnot_si128(flat, p2);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ // when (flat && mask)
+ p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
+
+ ps1 = _mm_andnot_si128(flat, ps1);
+  // p1 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ // when (flat && mask)
+ p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
+ qs1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
+
+ ps0 = _mm_andnot_si128(flat, ps0);
+  // p0 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ // when (flat && mask)
+ p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
+ qs0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
+ // end highbd_filter8
+
+ // highbd_filter16
+ p6 = _mm_andnot_si128(flat2, p6);
+ // p6 remains unchanged if !(flat2 && flat && mask)
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ // get values for when (flat2 && flat && mask)
+ p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
+ q6 = _mm_andnot_si128(flat2, q6);
+ // q6 remains unchanged if !(flat2 && flat && mask)
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ // get values for when (flat2 && flat && mask)
+ q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
+ _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+ _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ // p5 remains unchanged if !(flat2 && flat && mask)
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ // get values for when (flat2 && flat && mask)
+ p5 = _mm_or_si128(p5, flat2_p5);
+ // full list of p5 values
+ q5 = _mm_andnot_si128(flat2, q5);
+ // q5 remains unchanged if !(flat2 && flat && mask)
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ // get values for when (flat2 && flat && mask)
+ q5 = _mm_or_si128(q5, flat2_q5);
+ // full list of q5 values
+ _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+ _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ // p4 remains unchanged if !(flat2 && flat && mask)
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ // get values for when (flat2 && flat && mask)
+ p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
+ q4 = _mm_andnot_si128(flat2, q4);
+ // q4 remains unchanged if !(flat2 && flat && mask)
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ // get values for when (flat2 && flat && mask)
+ q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
+ _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+ _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ // get values for when (flat2 && flat && mask)
+ p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
+ q3 = _mm_andnot_si128(flat2, q3);
+ // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ // get values for when (flat2 && flat && mask)
+ q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
+ _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+ _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ // get values for when (flat2 && flat && mask)
+ p2 = _mm_or_si128(p2, flat2_p2);
+ // full list of p2 values
+ q2 = _mm_andnot_si128(flat2, q2);
+ // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ // get values for when (flat2 && flat && mask)
+ q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ // get values for when (flat2 && flat && mask)
+ p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
+ q1 = _mm_andnot_si128(flat2, q1);
+ // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ // get values for when (flat2 && flat && mask)
+ q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ // get values for when (flat2 && flat && mask)
+ p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
+ q0 = _mm_andnot_si128(flat2, q0);
+ // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ // get values for when (flat2 && flat && mask)
+ q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
+}
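
The andnot/and/or sequences above are a branchless per-lane select, "mask ? a : b", where every 16-bit lane of the mask is either all ones or all zeros. A minimal sketch of the idiom (hypothetical helper, not part of the file):

    static INLINE __m128i select_epi16(const __m128i mask, const __m128i a,
                                       const __m128i b) {
      // Take a where the mask lanes are all ones, b where they are all zeros.
      return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
    }

The function inlines this pattern, first to merge the filter4/filter8 results under flat and then to merge in the wide-filter results under flat2.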
+
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+ vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft;
+
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ t80 = _mm_set1_epi16(0x200);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ t80 = _mm_set1_epi16(0x800);
+ }
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+
+ // filter_mask and hev_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ mask = _mm_max_epi16(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ mask = _mm_max_epi16(abs_q1q0, mask);
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(abs_q1q0, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+  // 'four' is added before the shift to supply the rounding term of
+  // ROUND_POWER_OF_TWO
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+ // lp filter
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = signed_char_clamp_bd_sse2(filt, bd);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi16(filt, t4);
+ filter2 = _mm_adds_epi16(filt, t3);
+
+ // Filter1 >> 3
+ filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+ filter1 = _mm_srai_epi16(filter1, 3);
+
+ // Filter2 >> 3
+ filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+ filter2 = _mm_srai_epi16(filter2, 3);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
+}
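
The flat_op*/flat_oq* values computed above are the 7-tap filter8 outputs in scalar form (ROUND_POWER_OF_TWO(v, 3) is (v + 4) >> 3), for example:

    op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
    oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);

The running workp_a/workp_b sums simply slide this window one tap at a time instead of recomputing each sum from scratch.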
+
+void vpx_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ __m128i work;
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ __m128i tff80;
+ __m128i tffe0;
+ __m128i t1f;
+ // equivalent to shifting 0x1f left by bitdepth - 8
+ // and setting new bits to 1
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i t7f;
+ // equivalent to shifting 0x7f left by bitdepth - 8
+ // and setting new bits to 1
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16((int16_t)0xff80);
+ tffe0 = _mm_set1_epi16((int16_t)0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ mask = _mm_max_epi16(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // filter4
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm_and_si128(filter1, t1f); // clamp the range
+ filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, tffe0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ work_a = _mm_cmpgt_epi16(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, tff80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ p0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
+ int out_p, int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ uint16_t *in = src[idx8x8];
+ uint16_t *out = dst[idx8x8];
+
+ p0 =
+ _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ p1 =
+ _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ p2 =
+ _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ p3 =
+ _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ p4 =
+ _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ p5 =
+ _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ p6 =
+ _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ p7 =
+ _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13
+ x0 = _mm_unpacklo_epi16(p0, p1);
+ // 20 30 21 31 22 32 23 33
+ x1 = _mm_unpacklo_epi16(p2, p3);
+ // 40 50 41 51 42 52 43 53
+ x2 = _mm_unpacklo_epi16(p4, p5);
+ // 60 70 61 71 62 72 63 73
+ x3 = _mm_unpacklo_epi16(p6, p7);
+ // 00 10 20 30 01 11 21 31
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 40 50 60 70 41 51 61 71
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 00 10 20 30 40 50 60 70
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 01 11 21 31 41 51 61 71
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
+ // 00 10 20 30 40 50 60 70
+ _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
+ // 01 11 21 31 41 51 61 71
+
+ // 02 12 22 32 03 13 23 33
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 42 52 62 72 43 53 63 73
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 02 12 22 32 42 52 62 72
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
+ // 02 12 22 32 42 52 62 72
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
+ // 03 13 23 33 43 53 63 73
+
+ // 04 14 05 15 06 16 07 17
+ x0 = _mm_unpackhi_epi16(p0, p1);
+ // 24 34 25 35 26 36 27 37
+ x1 = _mm_unpackhi_epi16(p2, p3);
+ // 44 54 45 55 46 56 47 57
+ x2 = _mm_unpackhi_epi16(p4, p5);
+ // 64 74 65 75 66 76 67 77
+ x3 = _mm_unpackhi_epi16(p6, p7);
+ // 04 14 24 34 05 15 25 35
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 44 54 64 74 45 55 65 75
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 04 14 24 34 44 54 64 74
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 05 15 25 35 45 55 65 75
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
+ // 04 14 24 34 44 54 64 74
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
+ // 05 15 25 35 45 55 65 75
+
+ // 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 06 16 26 36 46 56 66 76
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
+ // 06 16 26 36 46 56 66 76
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
+ // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
+ uint16_t *out, int out_p) {
+ uint16_t *src0[1];
+ uint16_t *src1[1];
+ uint16_t *dest0[1];
+ uint16_t *dest1[1];
+ src0[0] = in0;
+ src1[0] = in1;
+ dest0[0] = out;
+ dest1[0] = out + 8;
+ highbd_transpose(src0, in_p, dest0, out_p, 1);
+ highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ highbd_transpose(src, pitch, dst, 8, 2);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
+ bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+
+ // Transpose 16x16
+ highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
+ thresh, bd);
+
+ // Transpose back
+ highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+ pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..fbebd7db1c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
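
The "- 1" works because the operands are integers: abs_coeff >= zbin is equivalent to abs_coeff > zbin - 1, so a single greater-than compare replaces a greater-than plus an equal-to. Scalar view of the test later done with _mm256_cmpgt_epi32 (illustrative names):

    // Keep the coefficient iff it reaches the zero-bin threshold.
    const int keep = abs_coeff > (zbin - 1);  // same as abs_coeff >= zbin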
+
+// Note:
+// Multiplies each 32-bit lane of *x by the corresponding lane of *y and
+// right-shifts each 64-bit product by 16. The eight 32-bit results are
+// returned as a single vector.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
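
Per 32-bit lane this is (x * y) >> 16 computed with a 64-bit intermediate; the even/odd split above exists only because _mm256_mul_epi32 multiplies the even lanes. Scalar sketch of the per-lane result (hypothetical helper):

    static int32_t mul_shift_scalar(int32_t x, int32_t y) {
      return (int32_t)(((int64_t)x * y) >> 16);
    }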
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
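
This is the vector form of the per-coefficient high-bitdepth quantizer. Roughly, for each coefficient rc (a scalar sketch with illustrative array names, saturation details omitted):

    const int abs_coeff = abs(coeff[rc]);
    if (abs_coeff >= zbin[rc != 0]) {
      const int64_t tmp  = abs_coeff + round[rc != 0];
      const int64_t tmp2 = ((tmp * quant[rc != 0]) >> 16) + tmp;
      const int abs_q    = (int)((tmp2 * quant_shift[rc != 0]) >> 16);
      qcoeff[rc]  = coeff[rc] < 0 ? -abs_q : abs_q;
      dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
    } else {
      qcoeff[rc] = dqcoeff[rc] = 0;
    }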
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ const unsigned int step = 8;
+ intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+
+ init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr,
+ mb_plane->quant_shift, qp, 1);
+
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..a5d874f3bc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i;
+}
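
The pre-scan pass trims trailing groups of four coefficients whose values all lie strictly inside the zero bin and therefore cannot quantize to anything nonzero. The scalar condition checked per coefficient is, roughly (illustrative names):

    // Skip the coefficient when it is strictly inside the zero bin.
    const int skip = coeff > -zbin[rc != 0] && coeff < zbin[rc != 0];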
+
+void vpx_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = 0;
+ const intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
+
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob;
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 0000000000..e483fdce73
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
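+// Horizontally reduce the four 8x32-bit accumulators (one per reference) to
+// four 32-bit totals and store them to sad_array in reference order.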
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
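+  // Each 16-bit lane of sums_16 accumulates 64/16 = 4 absolute differences
+  // per row, each at most 4095 for 12-bit input, so the partial sums are
+  // folded into the 32-bit sums_32 accumulators every 2 rows, well before
+  // the lanes can wrap.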
+ for (i = 0; i < (n / 2); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+    /* Fold the current sums_16 into sums_32 every 2 rows so the 16-bit lane
+     * totals cannot overflow. */
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 1;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
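+// The skip variants sample every other row (both strides are doubled and only
+// n/2 rows are summed) and then double the result to approximate the SAD of
+// the full block.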
+#define HIGHBD_SADSKIP64XNx4D(n) \
+ void vpx_highbd_sad_skip_64x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[8];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+ r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+ r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+ r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+ r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+ sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+ sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+ sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 8); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+    /* Fold the current sums_16 into sums_32 every 8 rows so the 16-bit lane
+     * totals cannot overflow. */
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 3;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD32XNX4D(n) \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP32XNx4D(n) \
+ void vpx_highbd_sad_skip_32x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+ // absolute differences between every ref[] to src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+ sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+ sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+ sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < num_iters; ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
+
+    // Fold the current sums_16 into sums_32 every 16 rows so the 16-bit lane
+    // totals cannot overflow.
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 4;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD16XNX4D(n) \
+ void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP16XNx4D(n) \
+ void vpx_highbd_sad_skip_16x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+// clang-format on
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..a07892d811
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,326 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
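+; The per-word absolute difference |src - ref| is built from two saturating
+; unsigned subtractions OR-ed together (psubusw/por); pmaddwd against m1
+; (every word set to 1) then sums adjacent word pairs into the dword
+; accumulators m4-m7, one accumulator per reference.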
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ;
+%endif ; sad/avg/skip
+
+; set m1
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+ ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+ ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
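+  ; Fold each per-reference dword accumulator m4..m7 down to a single total
+  ; and pack the four totals into m4 so they can be written to res[4] with a
+  ; single store.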
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..78f8eb8bfa
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
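+// Reduce the eight 32-bit lanes of sums_32 to a single SAD by summing the
+// lanes within each 128-bit half and then adding the two halves together.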
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+ const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+ const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+ _mm256_extractf128_si256(t1, 1));
+ return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load a row of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 2); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+    /* Fold the current sums_16 into sums_32 every 2 rows so the 16-bit lane
+     * totals cannot overflow. */
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP64xN(n) \
+ unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load a row of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 8); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+    /* Fold the current sums_16 into sums_32 every 8 rows so the 16-bit lane
+     * totals cannot overflow. */
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 3;
+ ref += ref_stride << 3;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP32xN(n) \
+ unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+    // load two rows of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ // absolute differences between every ref[] to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
+ int i;
+
+ for (i = 0; i < num_iters; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
+
+    // Fold the current sums_16 into sums_32 every 16 rows so the 16-bit lane
+    // totals cannot overflow.
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD16XN(n) \
+ unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP16xN(n) \
+ unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+// clang-format on
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load a row of src, ref, and sec
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+ const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+ const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+ // absolute differences between every ref/pred avg to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 64;
+ }
+}
+
+#define HIGHBD_SAD64XN_AVG(n) \
+ unsigned int vpx_highbd_sad64x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+ \
+      /* Fold the current sums_16 into sums_32 every 2 rows so the 16-bit    \
+       * lane totals cannot overflow. */                                     \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ sec += 64 << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load a row of src, ref, and sec
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between every ref/pred avg to src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 32;
+ }
+}
+
+#define HIGHBD_SAD32XN_AVG(n) \
+ unsigned int vpx_highbd_sad32x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+ \
+      /* Fold the current sums_16 into sums_32 every 8 rows so the 16-bit    \
+       * lane totals cannot overflow. */                                     \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ sec += 32 << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+    // load two rows of src, ref, and sec
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between the ref/pred averages and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ sec += 32;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+    // Fold the current sums_16 into sums_32 every 16 rows so the 16-bit lane
+    // totals cannot overflow.
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sec += 16 << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..62ad2237ff
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,416 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
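+; For the skip variants both strides are doubled up front and the final SAD is
+; doubled (pslld by 1) to compensate for summing only every other row.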
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%elif %4 == 1 ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if VPX_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
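+; Per row: |src - ref| is computed on unsigned words with two saturating
+; subtractions OR-ed together (psubusw/por), the word sums are folded with
+; movhlps/paddw and widened to dwords with punpcklwd against zero (m6) before
+; being accumulated into m0.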
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..5a3a2818de
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1021 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
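+; Bilinear filter table: each 32-byte entry holds the two tap weights (each
+; repeated 8 times) for one of the eight sub-pel offsets; the taps always sum
+; to 16, and pw_8 supplies the matching rounding constant.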
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *ref, ptrdiff_t ref_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
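+; SUM_SSE accumulates the signed sum of (src - ref) differences into %5 and
+; the sum of squared differences into %6; pcmpgtw/punpcklwd sign-extend the
+; packed word sums to dwords before they are added.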
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if VPX_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define second_str second_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl second_str, 1
+%endif
+
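+  ; The dispatch below distinguishes, for each of x_offset and y_offset,
+  ; the values 0 (no filtering), 8 (half-pel, a plain pavgw average) and
+  ; anything else (full bilinear filtering), giving the nine .x_*_y_*
+  ; cases that follow.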
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [refq]
+ mova m3, [refq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m2, [second_predq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [refq]
+ mova m3, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [refq]
+ mova m3, [refq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [refq]
+ mova m3, [refq+ref_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [refq]
+ mova m3, [refq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we
+  ; can also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number
+  ; of instructions is the same (5), but it is 1 mul instead of 2, so it
+  ; might be slightly faster because of pmullw latency. It would also cut
+  ; our rodata tables in half for this function, and save 1-2 registers
+  ; on x86-64.
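+  ; For example, with num=16, rnd=8, x=6, in1=100, in2=200:
+  ;   ((16-6)*100 + 6*200 + 8) >> 4  = 2208 >> 4 = 138
+  ;   100 + ((6*(200-100) + 8) >> 4) = 100 + 38  = 138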
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [refq]
+ mova m3, [refq+ref_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [refq]
+ mova m3, [refq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [refq]
+ mova m3, [refq + ref_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [refq]
+ mova m5, [refq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [refq]
+ mova m5, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [refq]
+ psrlw m0, 4
+ mova m3, [refq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [refq]
+ psrlw m0, 4
+ mova m3, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m4, [second_predq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [refq]
+ mova m5, [refq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [refq]
+ mova m5, [refq+ref_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea refq, [refq+ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [refq]
+ mova m5, [refq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [refq]
+ mova m5, [refq+ref_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea refq, [refq+ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; Load the filters. This is the same as in the 8-bit case.
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register. Use the src_stride register;
+; src_stride then has to be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [refq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [refq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq + ref_strideq * 2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [refq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m4, [second_predq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq + ref_strideq * 4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
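+
+; The four instantiations above emit highbd_sub_pixel_variance{8,16}xh_sse2
+; and, with the second macro argument set to 1 (avg), the corresponding
+; highbd_sub_pixel_avg_variance{8,16}xh_sse2 functions.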
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..5bee51fa0c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,315 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;unsigned int vpx_highbd_calc16x16var_sse2
+;(
+;    const uint16_t *src_ptr,
+;    int src_stride,
+;    const uint16_t *ref_ptr,
+;    int ref_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+globalsym(vpx_highbd_calc16x16var_sse2)
+sym(vpx_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
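+        ; Sign-extend the eight word sums in xmm5 to dwords before
+        ; accumulating into xmm7: (xmm5 > 0) | (xmm5 == 0) marks the
+        ; non-negative words, comparing that mask against zero inverts
+        ; it into all-ones for the negative words, and punpckl/hwd then
+        ; interleaves the mask in as the high word of each dword.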
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vpx_highbd_calc8x8var_sse2
+;(
+;    const uint16_t *src_ptr,
+;    int src_stride,
+;    const uint16_t *ref_ptr,
+;    int ref_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+globalsym(vpx_highbd_calc8x8var_sse2)
+sym(vpx_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..381e0ad193
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
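+
+// In the 10- and 12-bit accumulators above, the totals are normalized
+// back to an 8-bit range before being returned; with bd the bit depth
+// (10 or 12), the two tail statements amount to (a sketch, assuming the
+// usual ROUND_POWER_OF_TWO(v, n) == (v + (1 << ((n) - 1))) >> (n)):
+//   *sum = ROUND_POWER_OF_TWO(sum_long, bd - 8);
+//   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 2 * (bd - 8));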
+
+#define HIGH_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ }
+
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+#undef VAR_FN
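+
+// Each VAR_FN wrapper above computes the standard unnormalized identity
+//   variance = SSE - SUM^2 / (w * h)
+// with the division carried out as a shift by (log2(w) + log2(h)); for
+// example, for 16x16: sse - (uint32_t)(((int64_t)sum * sum) >> 8).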
+
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These declare the functions defined in
+// highbd_subpel_variance_impl_sse2.asm.
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *ref, ptrdiff_t ref_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt) \
+ DECL(16, opt)
+
+DECLS(sse2)
+
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
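+
+// A sketch of how the FN() wrappers above cover blocks wider than the
+// 16-pixel asm kernel: the block is split into independent 16-wide
+// columns whose partial SE/SSE results are summed; e.g. for w == 64 the
+// kernel is run at column offsets 0, 16, 32 and 48.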
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \
+ ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt1) \
+ DECL(16, opt1) \
+ DECL(8, opt1)
+
+DECLS(sse2)
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+ FN(8, 16, 8, 4, 3, opt1, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}
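+
+// A scalar sketch of what all three SIMD paths above compute, assuming
+// _mm_avg_epu16()'s rounding average ((a + b + 1) >> 1):
+//   for (i = 0; i < height; i++)
+//     for (j = 0; j < width; j++)
+//       comp_pred[i * width + j] =
+//           (pred[i * width + j] + ref[i * ref_stride + j] + 1) >> 1;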
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
new file mode 100644
index 0000000000..61af6236ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -0,0 +1,860 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
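+
+; pw_N is the DC rounding constant when summing 2N pixels (above + left,
+; shifted right by log2(2N)); pw2_N is the constant when summing only N
+; pixels (above-only / left-only, shifted right by log2(N)).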
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; Trick from Pascal: (x + 2y + z + 2) >> 2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
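+
+; Worked check of the trick with x=1, y=2, z=3 (pavgb rounds up,
+; avg(a,b) = (a + b + 1) >> 1; the xor/and step removes the stray
+; rounding carry from the first average):
+;   target: (1 + 2*2 + 3 + 2) >> 2 = 10 >> 2 = 2
+;   avg(1,3) = 2;  (1 ^ 3) & 1 = 0, so result stays 2;  avg(2,2) = 2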
+
+INIT_XMM sse2
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, temp
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+
+ ; store 4 lines
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ lea dstq, [dstq+strideq*2]
+ psrlq m3, 8
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ psrlq m0, 56
+ movd tempd, m0
+ mov [dstq+strideq+3], tempb
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movu m1, [aboveq]
+ pslldq m0, m1, 1
+ psrldq m2, m1, 1
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ punpckhbw m0, m0 ; 7 7
+ punpcklwd m0, m0 ; 7 7 7 7
+ punpckldq m0, m0 ; 7 7 7 7 7 7 7 7
+ punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
+
+ ; store 4 lines
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+
+ ; store next 4 lines
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+ GET_GOT goffsetq
+
+ movd m0, [leftq] ; abcd [byte]
+ punpcklbw m4, m0, m0 ; aabb ccdd
+ punpcklwd m4, m4 ; aaaa bbbb cccc dddd
+ psrldq m4, 12 ; dddd
+ punpckldq m0, m4 ; abcd dddd
+ psrldq m1, m0, 1 ; bcdd
+ psrldq m2, m0, 2 ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d
+ pavgb m1, m0 ; ab, bc, cd, d [byte]
+
+ punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+ movd [dstq ], m1
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
+ movd [dstq+strideq], m1
+
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1
+ movd [dstq+strideq], m4 ; d, d, d, d
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+  movq m0, [aboveq-1] ; [63:0] tl t1 t2 t3 t4 x x x
+ punpcklbw m0, m1
+ pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
+ psrldq m0, 2
+ psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+ movd m2, [leftq]
+ punpcklbw m2, m1
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ pshuflw m4, m2, 0xaa
+ pshuflw m3, m2, 0xff
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ RET
+
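+; In scalar terms the TrueMotion predictor above computes (clip_pixel
+; saturates to [0, 255], which packuswb provides):
+;
+;   for (r = 0; r < 4; r++)
+;     for (c = 0; c < 4; c++)
+;       dst[r * stride + c] = clip_pixel(left[r] + above[c] - above[-1]);
+;
+; The SIMD version computes the (above[c] - above[-1]) row once and adds a
+; broadcast left[r] per output line.
+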
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ movq m0, [aboveq]
+ punpcklbw m2, m1
+ punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+ pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -4
+ punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
+ psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+ movq m2, [leftq]
+ punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+ punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m3
+ movq [dstq ], m4
+ movhps [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m2, 4
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
+ pxor m1, m1
+  mova m2, [aboveq-16]
+ mova m0, [aboveq] ; t1 t2 ... t16 [byte]
+ punpckhbw m2, m1 ; [127:112] tl [word]
+ punpckhbw m4, m0, m1
+ punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word]
+ DEFINE_ARGS dst, stride, line, left, stride8
+ mov lineq, -8
+ pshufhw m2, m2, 0xff
+ mova m3, [leftq] ; l1 l2 ... l16 [byte]
+ punpckhqdq m2, m2 ; tl repeated 8 times [word]
+ psubw m0, m2
+ psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word]
+ lea stride8q, [strideq*8]
+.loop:
+ pshuflw m6, m3, 0x0
+ pshuflw m7, m5, 0x0
+ punpcklqdq m6, m6 ; l1 repeated 8 times [word]
+  punpcklqdq m7, m7 ; l9 repeated 8 times [word]
+ paddw m1, m6, m0
+  paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,16] [word]
+ psrldq m5, 2
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m1, m7, m0
+  paddw m7, m4 ; m1:m7 ti-tl+l9 [i=1,16] [word]
+ psrldq m3, 2
+ packuswb m1, m7
+ mova [dstq+stride8q], m1
+ inc lineq
+ lea dstq, [dstq+strideq]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ punpcklbw m2, m1
+ punpckhbw m3, m0, m1
+ punpckhbw m5, m4, m1
+ punpcklbw m0, m1
+ punpcklbw m4, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ punpcklqdq m2, m2
+ add leftq, 32
+ psubw m0, m2
+ psubw m3, m2
+ psubw m4, m2
+ psubw m5, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ pxor m1, m1
+ punpcklbw m2, m1
+ pshuflw m7, m2, 0x55
+ pshuflw m2, m2, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m7, m7
+ paddw m6, m2, m3
+ paddw m1, m2, m0
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m6, m2, m5
+ paddw m1, m2, m4
+ packuswb m1, m6
+ mova [dstq+16 ], m1
+ paddw m6, m7, m3
+ paddw m1, m7, m0
+ packuswb m1, m6
+ mova [dstq+strideq ], m1
+ paddw m6, m7, m5
+ paddw m1, m7, m4
+ packuswb m1, m6
+ mova [dstq+strideq+16], m1
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
new file mode 100644
index 0000000000..5e0139fa8d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
@@ -0,0 +1,871 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, dst8, line
+ lea stride3q, [strideq*3]
+ lea dst8q, [dstq+strideq*8]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pavgb m3, m2, m0
+ pxor m2, m0
+ pshufb m0, m1
+ pand m2, [GLOBAL(pb_1)]
+ psubb m3, m2
+ pavgb m0, m3
+
+  ; write lines 0-7 in full, plus the left half of lines 8-15 (2 passes of 4)
+ mov lined, 2
+.loop:
+ mova [dstq ], m0
+ movhps [dst8q ], m0
+ pshufb m0, m1
+ mova [dstq +strideq ], m0
+ movhps [dst8q+strideq ], m0
+ pshufb m0, m1
+ mova [dstq +strideq*2 ], m0
+ movhps [dst8q+strideq*2 ], m0
+ pshufb m0, m1
+ mova [dstq +stride3q ], m0
+ movhps [dst8q+stride3q ], m0
+ pshufb m0, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ dec lined
+ jnz .loop
+
+ ; bottom-right 8x8 block
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+
+ RESTORE_GOT
+ RET
+
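+; Scalar shape of the D45 fill above, as a sketch: with f[] the 3-tap
+; smoothed top row (above[] replicated past its last sample by the shuffle
+; masks),
+;
+;   f[i] = (above[i] + 2 * above[i + 1] + above[i + 2] + 2) >> 2;
+;   dst[r * stride + c] = f[r + c];
+;
+; each row steps one sample further along the diagonal, which the repeated
+; pshufb by sh_b123456789abcdeff implements.
+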
+INIT_XMM ssse3
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, dst16, line
+ lea stride3q, [strideq*3]
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
+ pavgb m3, m2, m4
+ pxor m2, m4
+ palignr m5, m4, m0, 1
+ palignr m6, m4, m0, 2
+ pshufb m4, m1
+ pand m2, [GLOBAL(pb_1)]
+ psubb m3, m2
+ pavgb m4, m3
+ pavgb m3, m0, m6
+ pxor m0, m6
+ pand m0, [GLOBAL(pb_1)]
+ psubb m3, m0
+ pavgb m5, m3
+
+  ; write the first 16 lines in full, plus the left half of lines 16-31 (4 passes of 4)
+ mov lined, 4
+.loop:
+ mova [dstq ], m5
+ mova [dstq +16], m4
+ mova [dst16q ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +strideq ], m3
+ mova [dstq +strideq +16], m4
+ mova [dst16q+strideq ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ mova [dstq +strideq*2 ], m5
+ mova [dstq +strideq*2+16], m4
+ mova [dst16q+strideq*2 ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +stride3q ], m3
+ mova [dstq +stride3q +16], m4
+ mova [dst16q+stride3q ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst16q, [dst16q+strideq*4]
+ dec lined
+ jnz .loop
+
+  ; write the right half of lines 16-31
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+
+ RESTORE_GOT
+ RET
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
+
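+; A scalar check of the identity behind the macro, for 8-bit x, y, z:
+;
+;   int a = (x + z + 1) >> 1;  // pavgb %4, %1, %3
+;   a -= (x ^ z) & 1;          // drop the rounding carry
+;   a = (a + y + 1) >> 1;      // pavgb %4, %2
+;   assert(a == ((x + 2 * y + z + 2) >> 2));
+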
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ pshufb m1, m3, [GLOBAL(sh_b23456777)]
+ pshufb m2, m3, [GLOBAL(sh_b12345677)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m3, 1
+ psrldq m4, 1
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+ pshufb m3, [GLOBAL(sh_b0123456777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ psrldq m3, 1
+ psrldq m4, 1
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, line
+ lea stride3q, [strideq*3]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m0, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
+ pavgb m0, m3
+
+ mov lined, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m7, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ lea stride3q, [strideq*3]
+ pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m7, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
+ palignr m6, m7, m0, 1
+ palignr m5, m7, m0, 2
+ pavgb m7, m3
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
+ pavgb m0, m6
+
+ mov lined, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m7
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq +16], m4
+ palignr m3, m7, m0, 1
+ palignr m5, m4, m2, 1
+ pshufb m7, m1
+ pshufb m4, m1
+
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m5
+ mova [dstq+stride3q +16], m4
+ palignr m0, m7, m3, 1
+ palignr m2, m4, m5, 1
+ pshufb m7, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3
+ RESTORE_GOT
+ RET
+
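+; Scalar sketch of the two filtered sequences used above, with
+; p[] = { l4, l3, l2, l1, tl, t1, t2, t3 } as assembled by the shuffle:
+;
+;   B[i] = (p[i] + 2 * p[i + 1] + p[i + 2] + 2) >> 2;  // 3-tap macro
+;   A[i] = (p[i] + p[i + 1] + 1) >> 1;                 // pavgb
+;
+; Each row of the 4x4 block advances the interleaved A/B sequence by one
+; (A, B) pair, per the layout in the comments above.
+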
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+  pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [byte]
+  pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [byte]
+  pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [byte]
+  pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [byte]
+  psrldq m4, m0, 1 ; t1-7 [byte]
+  psrldq m5, m0, 2 ; t2-7 [byte]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m6, 15
+ palignr m3, m0, m6, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+ mova [dstq+stride3q ], m6
+ lea dstq, [dstq+strideq*4]
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ movu m1, [aboveq+15]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1
+ palignr m5, m1, m7, 2
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m7, 15
+ palignr m3, m0, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+
+ DEFINE_ARGS dst, stride, stride3, left, line
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4]
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4]
+
+ mova m7, [leftq]
+ mova m3, [leftq+16]
+ palignr m5, m3, m7, 15
+ palignr m0, m3, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4]
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ movq m3, [leftq] ; abcdefgh [byte]
+ lea stride3q, [strideq*3]
+
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+ pavgb m0, m2
+ punpcklbw m0, m3 ; interleaved output
+
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+ psrldq m0, 2
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
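+; Scalar sketch of the D207 8x8 fill above, with l[] = left and values
+; replicated past l[7] by the shuffle masks:
+;
+;   A[i] = (l[i] + l[i + 1] + 1) >> 1;                 // pavgb
+;   B[i] = (l[i] + 2 * l[i + 1] + l[i + 2] + 2) >> 2;  // 3-tap macro
+;
+; Row r is the interleaved sequence A[r], B[r], A[r + 1], B[r + 1], ...,
+; padded with l[7] once the left column runs out.
+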
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m0, [leftq] ; abcdefghijklmnop [byte]
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
+
+ punpckhbw m4, m1, m3 ; interleaved input
+ punpcklbw m1, m3 ; interleaved output
+ mova [dstq ], m1
+ palignr m3, m4, m1, 2
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 4
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 6
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ palignr m3, m4, m1, 8
+ mova [dstq ], m3
+ palignr m3, m4, m1, 10
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 12
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 14
+ mova [dstq+stride3q ], m3
+ DEFINE_ARGS dst, stride, stride3, line
+ mov lined, 2
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq*2], m4
+ pshufb m4, m0
+ mova [dstq+stride3q ], m4
+ pshufb m4, m0
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m1, [leftq] ; 0-15 [byte]
+ mova m2, [leftq+16] ; 16-31 [byte]
+ pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+ palignr m6, m2, m1, 1
+ palignr m5, m2, m1, 2
+ pavgb m2, m4 ; high 16px even lines
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+ pavgb m1, m6 ; low 16px even lines
+
+ punpckhbw m6, m1, m0 ; interleaved output 2
+ punpcklbw m1, m0 ; interleaved output 1
+
+ punpckhbw m7, m2, m3 ; interleaved output 4
+ punpcklbw m2, m3 ; interleaved output 3
+
+ ; output 1st 8 lines (and half of 2nd 8 lines)
+ DEFINE_ARGS dst, stride, stride3, dst8
+ lea dst8q, [dstq+strideq*8]
+ mova [dstq ], m1
+ mova [dstq +16], m6
+ mova [dst8q ], m6
+ palignr m0, m6, m1, 2
+ palignr m4, m2, m6, 2
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 4
+ palignr m4, m2, m6, 4
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 6
+ palignr m4, m2, m6, 6
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m0, m6, m1, 8
+ palignr m4, m2, m6, 8
+ mova [dstq ], m0
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m0, m6, m1, 10
+ palignr m4, m2, m6, 10
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 12
+ palignr m4, m2, m6, 12
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 14
+ palignr m4, m2, m6, 14
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+ mova [dstq +16], m2
+ mova [dst8q ], m2
+ palignr m4, m7, m2, 2
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 4
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 6
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m4, m7, m2, 8
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m4, m7, m2, 10
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 12
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 14
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+
+ ; output last half of 4th 8 lines
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+ lea dstq, [dstq+strideq*4]
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+
+ ; done!
+ RESTORE_GOT
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
new file mode 100644
index 0000000000..752435d240
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in,
+ int stride) {
+ int i;
+ // Load 16x16 values
+ for (i = 0; i < 16; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
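+    // tran_low_t is 32 bits wide here, so pack four 4-lane loads down to
+    // the 16x16-bit row used below.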
+ const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride));
+ const __m128i in1 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 4));
+ const __m128i in2 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 8));
+ const __m128i in3 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 12));
+ const __m128i ls = _mm_packs_epi32(in0, in1);
+ const __m128i rs = _mm_packs_epi32(in2, in3);
+ in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1);
+#else
+ in[i] = _mm256_load_si256((const __m256i *)(input + i * stride));
+#endif
+ }
+}
+
+static INLINE __m256i dct_round_shift_avx2(__m256i in) {
+ const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm256_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) {
+ const __m256i t = _mm256_madd_epi16(*in, *cospi);
+ return dct_round_shift_avx2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1,
+ __m256i *x) {
+ const __m256i t0 = idct_madd_round_shift_avx2(in0, x);
+ const __m256i t1 = idct_madd_round_shift_avx2(in1, x);
+ return _mm256_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1,
+ __m256i *out0, __m256i *out1) {
+ __m256i cst0 = PAIR256_SET_EPI16(c0, -c1);
+ __m256i cst1 = PAIR256_SET_EPI16(c1, c0);
+ __m256i lo = _mm256_unpacklo_epi16(in0, in1);
+ __m256i hi = _mm256_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0);
+ *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1);
+}
+
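+// In scalar terms butterfly16() is the rotation used throughout the iDCT
+// (a sketch; DCT_CONST_BITS is 14 in vpx_dsp/txfm_common.h):
+//
+//   out0[i] = ROUND_POWER_OF_TWO(in0[i] * c0 - in1[i] * c1, DCT_CONST_BITS);
+//   out1[i] = ROUND_POWER_OF_TWO(in0[i] * c1 + in1[i] * c0, DCT_CONST_BITS);
+//
+// The PAIR256_SET_EPI16 constants interleave (c0, -c1) and (c1, c0) so one
+// _mm256_madd_epi16 over the unpacked (in0, in1) pairs yields each 32-bit
+// sum of products directly.
+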
+static INLINE void idct16_16col(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm256_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm256_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm256_sub_epi16(step1[0], step1[7]);
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm256_add_epi16(step2[0], step1[15]);
+ out[1] = _mm256_add_epi16(step2[1], step1[14]);
+ out[2] = _mm256_add_epi16(step2[2], step2[13]);
+ out[3] = _mm256_add_epi16(step2[3], step2[12]);
+ out[4] = _mm256_add_epi16(step2[4], step2[11]);
+ out[5] = _mm256_add_epi16(step2[5], step2[10]);
+ out[6] = _mm256_add_epi16(step2[6], step1[9]);
+ out[7] = _mm256_add_epi16(step2[7], step1[8]);
+ out[8] = _mm256_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm256_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm256_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm256_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm256_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm256_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm256_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm256_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) {
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest)));
+ d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+ d0 = _mm256_unpacklo_epi8(d0, zero);
+ d0 = _mm256_add_epi16(in_x, d0);
+ d0 = _mm256_packus_epi16(
+ d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1)));
+
+ _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0));
+}
+
+static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ __m256i out;
+ out = _mm256_adds_epi16(in, final_rounding);
+ out = _mm256_srai_epi16(out, 6);
+ recon_and_store16(dest, out);
+}
+
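+// write_buffer_16x1() above and store_buffer_16x32() below share the final
+// reconstruction step; per pixel, roughly:
+//
+//   residual = ROUND_POWER_OF_TWO(in[i], 6);   // (x + 32) >> 6
+//   dest[i] = clip_pixel(dest[i] + residual);  // packus saturation
+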
+static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm256_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm256_srai_epi16(in[j], 6);
+ in[j + 1] = _mm256_srai_epi16(in[j + 1], 6);
+
+ recon_and_store16(dst, in[j]);
+ dst += stride;
+ recon_and_store16(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) {
+ __m256i t[16];
+
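+  // LOADL/LOADR gather the left/right 128-bit halves of rows idx and
+  // idx + 8 into one 256-bit register each, so the two 8x8 transposes below
+  // can work on the top and bottom halves of the result independently.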
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + (idx)] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + (idx)] = _mm256_inserti128_si256( \
+ t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ __m256i in[16];
+
+ // Load 16x16 values
+ idct_load16x16(input, in, 16);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ for (i = 0; i < 16; ++i) {
+ write_buffer_16x1(dest + i * stride, in[i]);
+ }
+}
+
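+// The 2-D inverse transform above is separable: one 1-D 16-point iDCT pass
+// over the rows (transpose + idct16_16col), one over the columns (second
+// transpose + idct16_16col), then a rounded add into the destination;
+// loosely, dest += round2(idct16(idct16(X)^T)^T, 6).
+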
+// Perform only the addition and subtraction butterfly; size is 16 or 32.
+static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm256_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
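+// Scalar equivalent of add_sub_butterfly_avx2(), mirroring the first half
+// of the inputs against the second half:
+//
+//   for (i = 0; i < size / 2; i++) {
+//     out[i] = in[i] + in[size - 1 - i];
+//     out[size - 1 - i] = in[i] - in[size - 1 - i];
+//   }
+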
+// For each 16x32 block __m256i in[32], the inputs with indexes
+// 0, 4, 8, 12, 16, 20, 24, 28 produce output pixels 0-7 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) {
+ __m256i step1[8], step2[8];
+
+ // stage 3
+ butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 4
+ butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm256_add_epi16(step1[0], step1[7]);
+ out[1] = _mm256_add_epi16(step1[1], step1[6]);
+ out[2] = _mm256_add_epi16(step1[2], step1[5]);
+ out[3] = _mm256_add_epi16(step1[3], step1[4]);
+ out[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm256_sub_epi16(step1[0], step1[7]);
+}
+
+static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10],
+ &out[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11],
+ &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+// For each 16x32 block __m256i in[32], the inputs with indexes
+// 2, 6, 10, 14, 18, 22, 26, 30 produce output pixels 8-15 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[14]);
+
+ idct32_16x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[16] = _mm256_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm256_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm256_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm256_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm256_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm256_add_epi16(step1[16], step1[23]);
+ out[17] = _mm256_add_epi16(step1[17], step1[22]);
+ out[18] = _mm256_add_epi16(step1[18], step1[21]);
+ out[19] = _mm256_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm256_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm256_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm256_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm256_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm256_add_epi16(step1[27], step1[28]);
+ out[29] = _mm256_add_epi16(step1[26], step1[29]);
+ out[30] = _mm256_add_epi16(step1[25], step1[30]);
+ out[31] = _mm256_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20],
+ &out[27]);
+ butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21],
+ &out[26]);
+ butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22],
+ &out[25]);
+ butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23],
+ &out[24]);
+}
+
+static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) {
+ __m256i temp[16];
+
+  // Inputs with indexes 0, 4, 8, 12, 16, 20, 24, 28 produce
+  // output pixels 0-7.
+ idct32_1024_16x32_quarter_1(in, temp);
+
+  // Inputs with indexes 2, 6, 10, 14, 18, 22, 26, 30 produce
+  // output pixels 8-15.
+ idct32_1024_16x32_quarter_2(in, temp);
+
+ // stage 7
+ add_sub_butterfly_avx2(temp, out, 16);
+}
+
+// For each 16x32 block __m256i in[32], the odd-indexed inputs
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// produce output pixels 16-31 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) {
+ __m256i step1[32], step2[32];
+
+ // stage 1
+ butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+ butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+ butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 2
+ step2[16] = _mm256_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm256_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm256_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm256_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm256_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm256_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm256_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm256_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm256_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm256_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm256_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_16x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) {
+ __m256i temp[32];
+
+  // Inputs with indexes 0, 4, 8, 12, 16, 20, 24, 28 produce output
+  // pixels 0-7, and inputs with indexes 2, 6, 10, 14, 18, 22, 26, 30
+  // produce output pixels 8-15.
+ idct32_1024_16x32_quarter_1_2(in, temp);
+
+  // The odd-indexed inputs 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
+  // 25, 27, 29, 31 produce output pixels 16-31.
+ idct32_1024_16x32_quarter_3_4(in, temp);
+
+ // final stage
+ add_sub_butterfly_avx2(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i l[32], r[32], out[32], *in;
+ int i;
+
+ in = l;
+
+ for (i = 0; i < 2; i++) {
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+
+ idct_load16x16(input + 16, in + 16, 32);
+ transpose_16bit_16x16_avx2(in + 16, in + 16);
+ idct32_1024_16x32(in, in);
+
+ in = r;
+ input += 32 << 4;
+ }
+
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(l + i, out);
+ transpose_16bit_16x16_avx2(r + i, out + 16);
+ idct32_1024_16x32(out, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
+
+// Case when only the upper-left 16x16 block has non-zero coefficients
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i in[32], io[32], out[32];
+ int i;
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm256_setzero_si256();
+ }
+
+ // rows
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+ idct32_1024_16x32(in, io);
+
+ // columns
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(io + i, in);
+ idct32_1024_16x32(in, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 0000000000..f42b3df849
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -0,0 +1,1235 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
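+// Transpose a 4x4 block of 16-bit values, with two rows packed per register.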
+static INLINE void transpose_16bit_4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i eight = _mm_set1_epi16(8);
+ __m128i in[2];
+
+ // Rows
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
+ idct4_sse2(in);
+
+ // Columns
+ idct4_sse2(in);
+
+ // Final round and shift
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
+
+ recon_and_store4x4_sse2(in, dest, stride);
+}
+
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+ __m128i dc_value, d[2];
+
+ a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ dc_value = _mm_set1_epi16(a);
+
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], dc_value);
+ d[1] = _mm_add_epi16(d[1], dc_value);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
+void idct4_sse2(__m128i *const in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ __m128i u[2];
+
+ transpose_16bit_4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+ u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
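+  // _MM_SHUFFLE(1, 0, 3, 2): swap the two 64-bit halves of in[1].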
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void iadst4_sse2(__m128i *const in) {
+ const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9);
+ const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9);
+ const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9);
+ const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_12_n3 =
+ pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9);
+ __m128i u[4], v[5];
+
+ // 00 01 20 21 02 03 22 23
+ // 10 11 30 31 12 13 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]);
+
+ // 00 01 10 11 20 21 30 31
+ // 02 03 12 13 22 23 32 33
+ in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1
+ v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3
+ v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1
+ v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3
+ v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1
+ in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2
+ in[1] = _mm_srli_epi32(in[1], 16);
+ in[0] = _mm_add_epi16(in[0], in[1]);
+ in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[3]);
+ u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[3] = _mm_add_epi32(u[3], v[4]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+static INLINE void load_buffer_8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 8);
+ in[1] = load_input_data8(input + 1 * 8);
+ in[2] = load_input_data8(input + 2 * 8);
+ in[3] = load_input_data8(input + 3 * 8);
+ in[4] = load_input_data8(input + 4 * 8);
+ in[5] = load_input_data8(input + 5 * 8);
+ in[6] = load_input_data8(input + 6 * 8);
+ in[7] = load_input_data8(input + 7 * 8);
+}
+
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[8];
+ int i;
+
+ // Load input data.
+ load_buffer_8x8(input, in);
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ vpx_idct8_sse2(in);
+ }
+
+ write_buffer_8x8(in, dest, stride);
+}
+
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[8];
+
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
+
+ idct8x8_12_add_kernel_sse2(io);
+ write_buffer_8x8(io, dest, stride);
+}
+
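+// Add in_x to two consecutive rows of 8 pixels at dest and store the
+// saturated results.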
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+ const __m128i in_x,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+ d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d1 = _mm_unpacklo_epi8(d1, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
+}
+
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+}
+
+void vpx_idct8_sse2(__m128i *const in) {
+ // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+ transpose_16bit_8x8(in, in);
+
+ // 4-stage 1D idct8x8
+ idct8(in, in);
+}
+
+void iadst8_sse2(__m128i *const in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i s[8], u[16], v[8], w[16];
+
+ // transpose
+ transpose_16bit_8x8(in, in);
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s[0] = _mm_unpacklo_epi16(in[7], in[0]);
+ s[1] = _mm_unpackhi_epi16(in[7], in[0]);
+ s[2] = _mm_unpacklo_epi16(in[5], in[2]);
+ s[3] = _mm_unpackhi_epi16(in[5], in[2]);
+ s[4] = _mm_unpacklo_epi16(in[3], in[4]);
+ s[5] = _mm_unpackhi_epi16(in[3], in[4]);
+ s[6] = _mm_unpacklo_epi16(in[1], in[6]);
+ s[7] = _mm_unpackhi_epi16(in[1], in[6]);
+
+ u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30);
+ u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30);
+ u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02);
+ u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02);
+ u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22);
+ u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22);
+ u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10);
+ u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10);
+ u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14);
+ u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14);
+ u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18);
+ u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18);
+ u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06);
+ u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06);
+ u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26);
+ u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26);
+
+ // addition
+ w[0] = _mm_add_epi32(u[0], u[8]);
+ w[1] = _mm_add_epi32(u[1], u[9]);
+ w[2] = _mm_add_epi32(u[2], u[10]);
+ w[3] = _mm_add_epi32(u[3], u[11]);
+ w[4] = _mm_add_epi32(u[4], u[12]);
+ w[5] = _mm_add_epi32(u[5], u[13]);
+ w[6] = _mm_add_epi32(u[6], u[14]);
+ w[7] = _mm_add_epi32(u[7], u[15]);
+ w[8] = _mm_sub_epi32(u[0], u[8]);
+ w[9] = _mm_sub_epi32(u[1], u[9]);
+ w[10] = _mm_sub_epi32(u[2], u[10]);
+ w[11] = _mm_sub_epi32(u[3], u[11]);
+ w[12] = _mm_sub_epi32(u[4], u[12]);
+ w[13] = _mm_sub_epi32(u[5], u[13]);
+ w[14] = _mm_sub_epi32(u[6], u[14]);
+ w[15] = _mm_sub_epi32(u[7], u[15]);
+
+ // shift and rounding
+ u[0] = dct_const_round_shift_sse2(w[0]);
+ u[1] = dct_const_round_shift_sse2(w[1]);
+ u[2] = dct_const_round_shift_sse2(w[2]);
+ u[3] = dct_const_round_shift_sse2(w[3]);
+ u[4] = dct_const_round_shift_sse2(w[4]);
+ u[5] = dct_const_round_shift_sse2(w[5]);
+ u[6] = dct_const_round_shift_sse2(w[6]);
+ u[7] = dct_const_round_shift_sse2(w[7]);
+ u[8] = dct_const_round_shift_sse2(w[8]);
+ u[9] = dct_const_round_shift_sse2(w[9]);
+ u[10] = dct_const_round_shift_sse2(w[10]);
+ u[11] = dct_const_round_shift_sse2(w[11]);
+ u[12] = dct_const_round_shift_sse2(w[12]);
+ u[13] = dct_const_round_shift_sse2(w[13]);
+ u[14] = dct_const_round_shift_sse2(w[14]);
+ u[15] = dct_const_round_shift_sse2(w[15]);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ in[2] = _mm_packs_epi32(u[4], u[5]);
+ in[3] = _mm_packs_epi32(u[6], u[7]);
+ in[4] = _mm_packs_epi32(u[8], u[9]);
+ in[5] = _mm_packs_epi32(u[10], u[11]);
+ in[6] = _mm_packs_epi32(u[12], u[13]);
+ in[7] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 2
+ s[0] = _mm_add_epi16(in[0], in[2]);
+ s[1] = _mm_add_epi16(in[1], in[3]);
+ s[2] = _mm_sub_epi16(in[0], in[2]);
+ s[3] = _mm_sub_epi16(in[1], in[3]);
+ u[0] = _mm_unpacklo_epi16(in[4], in[5]);
+ u[1] = _mm_unpackhi_epi16(in[4], in[5]);
+ u[2] = _mm_unpacklo_epi16(in[6], in[7]);
+ u[3] = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+
+ w[0] = _mm_add_epi32(v[0], v[4]);
+ w[1] = _mm_add_epi32(v[1], v[5]);
+ w[2] = _mm_add_epi32(v[2], v[6]);
+ w[3] = _mm_add_epi32(v[3], v[7]);
+ w[4] = _mm_sub_epi32(v[0], v[4]);
+ w[5] = _mm_sub_epi32(v[1], v[5]);
+ w[6] = _mm_sub_epi32(v[2], v[6]);
+ w[7] = _mm_sub_epi32(v[3], v[7]);
+
+ u[0] = dct_const_round_shift_sse2(w[0]);
+ u[1] = dct_const_round_shift_sse2(w[1]);
+ u[2] = dct_const_round_shift_sse2(w[2]);
+ u[3] = dct_const_round_shift_sse2(w[3]);
+ u[4] = dct_const_round_shift_sse2(w[4]);
+ u[5] = dct_const_round_shift_sse2(w[5]);
+ u[6] = dct_const_round_shift_sse2(w[6]);
+ u[7] = dct_const_round_shift_sse2(w[7]);
+
+  // back to 16-bit integers
+ s[4] = _mm_packs_epi32(u[0], u[1]);
+ s[5] = _mm_packs_epi32(u[2], u[3]);
+ s[6] = _mm_packs_epi32(u[4], u[5]);
+ s[7] = _mm_packs_epi32(u[6], u[7]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+
+ s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+ s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[4]);
+ in[2] = s[6];
+ in[3] = _mm_sub_epi16(kZero, s[2]);
+ in[4] = s[3];
+ in[5] = _mm_sub_epi16(kZero, s[7]);
+ in[6] = s[5];
+ in[7] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+}
+
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i l[16], r[16], out[16], *in;
+ int i;
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
+ idct16_load8x8(input + 8, in + 8);
+ transpose_16bit_8x8(in + 8, in + 8);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], temp[16], out[16];
+ int i;
+
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
+
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], l[16];
+ int i;
+
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = load_input_data4(input + 0 * 16);
+ in[1] = load_input_data4(input + 1 * 16);
+ in[2] = load_input_data4(input + 2 * 16);
+ in[3] = load_input_data4(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
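+// Add the eight 16-bit values in in_x to both halves of the 16 pixels at dest
+// and store the result with unsigned saturation.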
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_load_si128((__m128i *)(dest));
+ d1 = _mm_unpackhi_epi8(d0, zero);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_store_si128((__m128i *)(dest), d0);
+}
+
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ int i;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ for (i = 0; i < 16; ++i) {
+ recon_and_store_16(dest, dc_value);
+ dest += stride;
+ }
+}
+
+void vpx_iadst16_8col_sse2(__m128i *const in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i kZero = _mm_setzero_si128();
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+ u[4] = dct_const_round_shift_sse2(u[4]);
+ u[5] = dct_const_round_shift_sse2(u[5]);
+ u[6] = dct_const_round_shift_sse2(u[6]);
+ u[7] = dct_const_round_shift_sse2(u[7]);
+ u[8] = dct_const_round_shift_sse2(u[8]);
+ u[9] = dct_const_round_shift_sse2(u[9]);
+ u[10] = dct_const_round_shift_sse2(u[10]);
+ u[11] = dct_const_round_shift_sse2(u[11]);
+ u[12] = dct_const_round_shift_sse2(u[12]);
+ u[13] = dct_const_round_shift_sse2(u[13]);
+ u[14] = dct_const_round_shift_sse2(u[14]);
+ u[15] = dct_const_round_shift_sse2(u[15]);
+ u[16] = dct_const_round_shift_sse2(u[16]);
+ u[17] = dct_const_round_shift_sse2(u[17]);
+ u[18] = dct_const_round_shift_sse2(u[18]);
+ u[19] = dct_const_round_shift_sse2(u[19]);
+ u[20] = dct_const_round_shift_sse2(u[20]);
+ u[21] = dct_const_round_shift_sse2(u[21]);
+ u[22] = dct_const_round_shift_sse2(u[22]);
+ u[23] = dct_const_round_shift_sse2(u[23]);
+ u[24] = dct_const_round_shift_sse2(u[24]);
+ u[25] = dct_const_round_shift_sse2(u[25]);
+ u[26] = dct_const_round_shift_sse2(u[26]);
+ u[27] = dct_const_round_shift_sse2(u[27]);
+ u[28] = dct_const_round_shift_sse2(u[28]);
+ u[29] = dct_const_round_shift_sse2(u[29]);
+ u[30] = dct_const_round_shift_sse2(u[30]);
+ u[31] = dct_const_round_shift_sse2(u[31]);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+ u[4] = dct_const_round_shift_sse2(u[4]);
+ u[5] = dct_const_round_shift_sse2(u[5]);
+ u[6] = dct_const_round_shift_sse2(u[6]);
+ u[7] = dct_const_round_shift_sse2(u[7]);
+ u[8] = dct_const_round_shift_sse2(u[8]);
+ u[9] = dct_const_round_shift_sse2(u[9]);
+ u[10] = dct_const_round_shift_sse2(u[10]);
+ u[11] = dct_const_round_shift_sse2(u[11]);
+ u[12] = dct_const_round_shift_sse2(u[12]);
+ u[13] = dct_const_round_shift_sse2(u[13]);
+ u[14] = dct_const_round_shift_sse2(u[14]);
+ u[15] = dct_const_round_shift_sse2(u[15]);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ v[0] = dct_const_round_shift_sse2(u[0]);
+ v[1] = dct_const_round_shift_sse2(u[1]);
+ v[2] = dct_const_round_shift_sse2(u[2]);
+ v[3] = dct_const_round_shift_sse2(u[3]);
+ v[4] = dct_const_round_shift_sse2(u[4]);
+ v[5] = dct_const_round_shift_sse2(u[5]);
+ v[6] = dct_const_round_shift_sse2(u[6]);
+ v[7] = dct_const_round_shift_sse2(u[7]);
+ v[8] = dct_const_round_shift_sse2(u[8]);
+ v[9] = dct_const_round_shift_sse2(u[9]);
+ v[10] = dct_const_round_shift_sse2(u[10]);
+ v[11] = dct_const_round_shift_sse2(u[11]);
+ v[12] = dct_const_round_shift_sse2(u[12]);
+ v[13] = dct_const_round_shift_sse2(u[13]);
+ v[14] = dct_const_round_shift_sse2(u[14]);
+ v[15] = dct_const_round_shift_sse2(u[15]);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+ in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+ in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+ in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+ in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+ in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void idct16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ idct16_8col(in0, in0);
+ idct16_8col(in1, in1);
+}
+
+void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ vpx_iadst16_8col_sse2(in0);
+ vpx_iadst16_8col_sse2(in1);
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step2[0] = butterfly_cospi16(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with odd index, 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[32];
+
+ // stage 1
+ butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+ butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[32], col[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_sse2(io, col);
+
+ for (i = 0; i < 32; i += 8) {
+ int j;
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_sse2(io, io);
+
+ for (j = 0; j < 32; ++j) {
+ write_buffer_8x1(dest + j * stride, io[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 4
+ butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_1024_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_1024_8x32_quarter_1(in, temp);
+ idct32_1024_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+ butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+ butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_1024_8x32_quarter_1_2(in, temp);
+ idct32_1024_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[4][32], io[32];
+ int i;
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ load_transpose_16bit_8x8(&input[16], 32, &io[16]);
+ load_transpose_16bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+
+ idct32_1024_8x32(io, io);
+ store_buffer_8x32(io, dest, stride);
+ dest += 8;
+ }
+}
+
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], in[32], out[32];
+ int i;
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &in[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ store_buffer_8x32(out, dest, stride);
+ dest += 8;
+ }
+}
+
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ int j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ for (j = 0; j < 32; ++j) {
+ recon_and_store_16(dest + j * stride + 0, dc_value);
+ recon_and_store_16(dest + j * stride + 16, dc_value);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
new file mode 100644
index 0000000000..b4bbd186d2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -0,0 +1,710 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 30 31 32 33 00 01 02 03
+ // in[1]: 20 21 22 23 10 11 12 13
+ // in[2]: 40 41 42 43 70 71 72 73
+ // in[3]: 50 51 52 53 60 61 62 63
+ // to:
+ // tr0_0: 00 10 01 11 02 12 03 13
+ // tr0_1: 20 30 21 31 22 32 23 33
+ // tr0_2: 40 50 41 51 42 52 43 53
+ // tr0_3: 60 70 61 71 62 72 63 73
+ const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]);
+ const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]);
+
+ // Unpack 32 bit elements resulting in:
+ // tr1_0: 00 10 20 30 01 11 21 31
+ // tr1_1: 02 12 22 32 03 13 23 33
+ // tr1_2: 40 50 60 70 41 51 61 71
+ // tr1_3: 42 52 62 72 43 53 63 73
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+}
+
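+// Vector form of dct_const_round_shift():
+// (in + DCT_CONST_ROUNDING) >> DCT_CONST_BITS on each 32-bit lane.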
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+ const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+ const __m128i cospi) {
+ const __m128i t = _mm_madd_epi16(in, cospi);
+ return dct_const_round_shift_sse2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+ const __m128i in1,
+ const __m128i x) {
+ const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+ const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+ return _mm_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
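+// out0 = dct_const_round_shift(in0 * c0 - in1 * c1)
+// out1 = dct_const_round_shift(in0 * c1 + in1 * c0)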
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = pair_set_epi16(c0, -c1);
+ const __m128i cst1 = pair_set_epi16(c1, c0);
+ const __m128i lo = _mm_unpacklo_epi16(in0, in1);
+ const __m128i hi = _mm_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
+ *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
+}
+
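+// butterfly() special case: returns dct_const_round_shift(in * cospi_16_64).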
+static INLINE __m128i butterfly_cospi16(const __m128i in) {
+ const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+ const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+ return idct_calc_wraplow_sse2(lo, hi, cst);
+}
+
+// Functions to allow 8-bit optimisations to be used when profile 0 is used with
+// high bit depth enabled.
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in = _mm_load_si128((const __m128i *)data);
+ return _mm_packs_epi32(in, zero);
+#else
+ return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i in0 = _mm_load_si128((const __m128i *)data);
+ const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
+ return _mm_packs_epi32(in0, in1);
+#else
+ return _mm_load_si128((const __m128i *)data);
+#endif
+}
+
+static INLINE void load_transpose_16bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * stride);
+ in[1] = load_input_data8(input + 1 * stride);
+ in[2] = load_input_data8(input + 2 * stride);
+ in[3] = load_input_data8(input + 3 * stride);
+ in[4] = load_input_data8(input + 4 * stride);
+ in[5] = load_input_data8(input + 5 * stride);
+ in[6] = load_input_data8(input + 6 * stride);
+ in[7] = load_input_data8(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
+
+static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d0 = _mm_packus_epi16(d0, d0);
+ _mm_storel_epi64((__m128i *)(dest), d0);
+}
+
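+// Per element: (x + 16) >> 5, i.e. ROUND_POWER_OF_TWO(x, 5).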
+static INLINE void round_shift_8x8(const __m128i *const in,
+ __m128i *const out) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ out[0] = _mm_add_epi16(in[0], final_rounding);
+ out[1] = _mm_add_epi16(in[1], final_rounding);
+ out[2] = _mm_add_epi16(in[2], final_rounding);
+ out[3] = _mm_add_epi16(in[3], final_rounding);
+ out[4] = _mm_add_epi16(in[4], final_rounding);
+ out[5] = _mm_add_epi16(in[5], final_rounding);
+ out[6] = _mm_add_epi16(in[6], final_rounding);
+ out[7] = _mm_add_epi16(in[7], final_rounding);
+
+ out[0] = _mm_srai_epi16(out[0], 5);
+ out[1] = _mm_srai_epi16(out[1], 5);
+ out[2] = _mm_srai_epi16(out[2], 5);
+ out[3] = _mm_srai_epi16(out[3], 5);
+ out[4] = _mm_srai_epi16(out[4], 5);
+ out[5] = _mm_srai_epi16(out[5], 5);
+ out[6] = _mm_srai_epi16(out[6], 5);
+ out[7] = _mm_srai_epi16(out[7], 5);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *const in,
+ uint8_t *const dest, const int stride) {
+ __m128i t[8];
+
+ round_shift_8x8(in, t);
+
+ recon_and_store(dest + 0 * stride, t[0]);
+ recon_and_store(dest + 1 * stride, t[1]);
+ recon_and_store(dest + 2 * stride, t[2]);
+ recon_and_store(dest + 3 * stride, t[3]);
+ recon_and_store(dest + 4 * stride, t[4]);
+ recon_and_store(dest + 5 * stride, t[5]);
+ recon_and_store(dest + 6 * stride, t[6]);
+ recon_and_store(dest + 7 * stride, t[7]);
+}
+
+static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
+ uint8_t *const dest,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d[2];
+
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], in[0]);
+ d[1] = _mm_add_epi16(d[1], in[1]);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
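+// Round with (x + 32) >> 6, then reconstruct and store 32 rows of 8 pixels.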
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ recon_and_store(dst, in[j]);
+ dst += stride;
+ recon_and_store(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store(dest, out);
+}
+
+// Only do the addition and subtraction butterfly; size = 16 or 32.
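+// out[i]            = in[i] + in[size - 1 - i]
+// out[size - 1 - i] = in[i] - in[size - 1 - i]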
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+static INLINE void idct8(const __m128i *const in /*in[8]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 1
+ butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 2
+ butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ out[0] = _mm_add_epi16(step1[0], step2[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step2[4]);
+ out[4] = _mm_sub_epi16(step1[3], step2[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i step1[8], step2[8], tmp[4];
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero);
+ const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7
+ step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6
+ }
+
+ // stage 2
+ {
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero);
+ const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero);
+ const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0);
+ step2[0] = _mm_packs_epi32(t, t); // step2 0&1
+ step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6
+ }
+
+ // stage 3
+ {
+ const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6
+ }
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+ io[4] = io[5] = io[6] = io[7] = zero;
+
+ idct8(io, io);
+}
+
+static INLINE void idct16_8col(const __m128i *const in /*in[16]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm_add_epi16(step2[0], step1[15]);
+ out[1] = _mm_add_epi16(step2[1], step1[14]);
+ out[2] = _mm_add_epi16(step2[2], step2[13]);
+ out[3] = _mm_add_epi16(step2[3], step2[12]);
+ out[4] = _mm_add_epi16(step2[4], step2[11]);
+ out[5] = _mm_add_epi16(step2[5], step2[10]);
+ out[6] = _mm_add_epi16(step2[6], step1[9]);
+ out[7] = _mm_add_epi16(step2[7], step1[8]);
+ out[8] = _mm_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
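+
+// Editor's note: a scalar sketch of the rotation butterfly() applies to each
+// pair of 16-bit lanes, assuming the pair_set_epi16()/_mm_madd_epi16()
+// layout used throughout this header (names here are illustrative, not
+// upstream API):
+#if 0
+static void butterfly_ref(int16_t in0, int16_t in1, int c0, int c1,
+                          int16_t *out0, int16_t *out1) {
+  *out0 = (int16_t)((in0 * c0 - in1 * c1 + (1 << 13)) >> 14);
+  *out1 = (int16_t)((in0 * c1 + in1 * c0 + (1 << 13)) >> 14);
+}
+#endif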
+
+static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/,
+ __m128i *const output /*output[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x4(input, output);
+
+ // stage 2
+ {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]);
+ step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30,
+ lo_1_15); // step2 8&15
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06,
+ lo_13_3); // step2 11&12
+ }
+
+ // stage 3
+ {
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28,
+ lo_2_14); // step1 4&7
+ step1[13] = _mm_unpackhi_epi64(step2[11], zero);
+ step1[14] = _mm_unpackhi_epi64(step2[8], zero);
+ }
+
+ // stage 4
+ {
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]);
+ const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16);
+ step1[0] = _mm_packs_epi32(t, t); // step2 0&1
+ step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08,
+ lo_9_14); // step2 9&14
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24,
+ lo_10_13); // step2 10&13
+ step2[6] = _mm_unpackhi_epi64(step1[4], zero);
+ }
+
+ // stage 5
+ {
+ const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]);
+ step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16,
+ lo_5_6); // step1 6&5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_unpackhi_epi64(step1[11], zero);
+ step1[13] = _mm_unpackhi_epi64(step1[10], zero);
+ step1[14] = _mm_unpackhi_epi64(step1[9], zero);
+ step1[15] = _mm_unpackhi_epi64(step1[8], zero);
+ }
+
+ // stage 6
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]);
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_10_13); // step2 10&13
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_11_12); // step2 11&12
+ step2[13] = _mm_unpackhi_epi64(step2[10], zero);
+ step2[12] = _mm_unpackhi_epi64(step2[11], zero);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[0] = _mm_unpackhi_epi64(step2[3], zero);
+ step2[2] = _mm_unpackhi_epi64(step2[1], zero);
+ step2[5] = _mm_unpackhi_epi64(step2[6], zero);
+ step2[7] = _mm_unpackhi_epi64(step2[4], zero);
+ }
+
+ // stage 7. Left 8x16 only.
+ output[0] = _mm_add_epi16(step2[0], step1[15]);
+ output[1] = _mm_add_epi16(step2[1], step1[14]);
+ output[2] = _mm_add_epi16(step2[2], step2[13]);
+ output[3] = _mm_add_epi16(step2[3], step2[12]);
+ output[4] = _mm_add_epi16(step2[4], step2[11]);
+ output[5] = _mm_add_epi16(step2[5], step2[10]);
+ output[6] = _mm_add_epi16(step2[6], step1[9]);
+ output[7] = _mm_add_epi16(step2[7], step1[8]);
+ output[8] = _mm_sub_epi16(step2[7], step1[8]);
+ output[9] = _mm_sub_epi16(step2[6], step1[9]);
+ output[10] = _mm_sub_epi16(step2[5], step2[10]);
+ output[11] = _mm_sub_epi16(step2[4], step2[11]);
+ output[12] = _mm_sub_epi16(step2[3], step2[12]);
+ output[13] = _mm_sub_epi16(step2[2], step2[13]);
+ output[14] = _mm_sub_epi16(step2[1], step1[14]);
+ output[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
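+
+// Editor's note: pass 1 of the 10-coefficient path assumes only the top-left
+// 4x4 of the 16x16 coefficients is non-zero, so each __m128i above packs two
+// symbols (see the "step2 8&15" comments) and only the left 8x16 half of the
+// intermediate result is produced.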
+
+static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/,
+ __m128i *const io /*io[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x8(l, io);
+
+ // stage 2
+ butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step1[0] = butterfly_cospi16(io[0]);
+ butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+
+ // stage 5
+ butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[2] = _mm_add_epi16(step1[0], step1[5]);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[0], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ io[0] = _mm_add_epi16(step2[0], step1[15]);
+ io[1] = _mm_add_epi16(step2[1], step1[14]);
+ io[2] = _mm_add_epi16(step2[2], step2[13]);
+ io[3] = _mm_add_epi16(step2[3], step2[12]);
+ io[4] = _mm_add_epi16(step2[4], step2[11]);
+ io[5] = _mm_add_epi16(step2[5], step2[10]);
+ io[6] = _mm_add_epi16(step2[6], step1[9]);
+ io[7] = _mm_add_epi16(step2[7], step1[8]);
+ io[8] = _mm_sub_epi16(step2[7], step1[8]);
+ io[9] = _mm_sub_epi16(step2[6], step1[9]);
+ io[10] = _mm_sub_epi16(step2[5], step2[10]);
+ io[11] = _mm_sub_epi16(step2[4], step2[11]);
+ io[12] = _mm_sub_epi16(step2[3], step2[12]);
+ io[13] = _mm_sub_epi16(step2[2], step2[13]);
+ io[14] = _mm_sub_epi16(step2[1], step1[14]);
+ io[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
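+
+// Editor's note: a hedged sketch of how a caller could drive the two
+// 10-coefficient passes (the upstream driver, vpx_idct16x16_10_add_sse2() in
+// inv_txfm_sse2.c, follows this shape; helper names as in inv_txfm_sse2.h):
+#if 0
+void idct16x16_10_add_sketch(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  __m128i in[16], l[16];
+  int i;
+  in[0] = load_input_data4(input + 0 * 16);
+  in[1] = load_input_data4(input + 1 * 16);
+  in[2] = load_input_data4(input + 2 * 16);
+  in[3] = load_input_data4(input + 3 * 16);
+  idct16x16_10_pass1(in, l);            // row transform of the 4x4 block
+  for (i = 0; i < 2; i++) {
+    int j;
+    idct16x16_10_pass2(l + 8 * i, in);  // column transform, 8 columns
+    for (j = 0; j < 16; ++j) {
+      recon_and_store(dest + j * stride, in[j]);  // add to prediction
+    }
+    dest += 8;
+  }
+}
+#endif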
+
+static INLINE void idct32_8x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm_add_epi16(step1[16], step1[23]);
+ out[17] = _mm_add_epi16(step1[17], step1[22]);
+ out[18] = _mm_add_epi16(step1[18], step1[21]);
+ out[19] = _mm_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm_add_epi16(step1[27], step1[28]);
+ out[29] = _mm_add_epi16(step1[26], step1[29]);
+ out[30] = _mm_add_epi16(step1[25], step1[30]);
+ out[31] = _mm_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]);
+ butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]);
+ butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]);
+ butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]);
+}
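+
+// Editor's note: the two stage-4-to-7 helpers above are shared by the 34-,
+// 135- and 1024-coefficient 32x32 code paths; only the earlier stages, where
+// known-zero input columns can be skipped, differ between those paths.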
+
+void idct4_sse2(__m128i *const in);
+void idct8_sse2(__m128i *const in);
+void idct16_sse2(__m128i *const in0, __m128i *const in1);
+void iadst4_sse2(__m128i *const in);
+void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
+void iadst16_sse2(__m128i *const in0, __m128i *const in1);
+void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 0000000000..6e99469b63
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = _mm_set1_epi16(2 * c0);
+ const __m128i cst1 = _mm_set1_epi16(2 * c1);
+ *out0 = _mm_mulhrs_epi16(in, cst0);
+ *out1 = _mm_mulhrs_epi16(in, cst1);
+}
+
+static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) {
+ const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64);
+ return _mm_mulhrs_epi16(in, coef_pair);
+}
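+
+// Editor's note: _mm_mulhrs_epi16(x, y) computes (x * y + (1 << 14)) >> 15
+// per lane, so passing y = 2 * c yields (x * c + (1 << 13)) >> 14, which is
+// exactly dct_const_round_shift(x * c) as long as 2 * c fits in int16. A
+// scalar sketch of the identity (illustrative only):
+#if 0
+static int16_t mulhrs_ref(int16_t x, int16_t y) {
+  return (int16_t)(((int32_t)x * y + (1 << 14)) >> 15);  // PMULHRSW
+}
+// mulhrs_ref(x, 2 * c) == dct_const_round_shift(x * c) for |2 * c| <= 32767.
+#endif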
+
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[8];
+
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
+
+ idct8x8_12_add_kernel_ssse3(io);
+ write_buffer_8x8(io, dest, stride);
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
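+
+// Editor's note: with only in[0] and in[4] non-zero here, most of stages 4-5
+// collapse to register copies (step1[0..3] all equal step2[0]), leaving a
+// single cospi_16_64 rotation before the stage 6 add/sub butterflies.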
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with odd index 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[32], col[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_ssse3(io, col);
+
+ for (i = 0; i < 32; i += 8) {
+ int j;
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_ssse3(io, io);
+
+ for (j = 0; j < 32; ++j) {
+ write_buffer_8x1(dest + j * stride, io[j]);
+ }
+
+ dest += 8;
+ }
+}
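+
+// Editor's note: the 2-D inverse transform is separable: one row pass over
+// the only non-zero 8x8 block produces col[32], then four column passes each
+// reconstruct an 8-pixel-wide strip, writing all 32 rows before advancing
+// dest by 8 columns.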
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_135_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_135_8x32_quarter_1(in, temp);
+ idct32_135_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with odd index 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+ idct32_135_8x32_quarter_1_2(in, temp);
+ idct32_135_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], io[32];
+ int i;
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ idct32_135_8x32_ssse3(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ idct32_135_8x32_ssse3(io, io);
+ store_buffer_8x32(io, dest, stride);
+ dest += 8;
+ }
+}
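+
+// Editor's note: the 135-coefficient path assumes a non-zero top-left 16x16
+// block, hence the two row-pass iterations over 16 columns each
+// (input += 32 << 3 advances 8 rows of 32 coefficients) before the four
+// column passes.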
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
new file mode 100644
index 0000000000..e9f0f69033
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
+ const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
+ const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
+ const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
+ const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
+ const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
+ const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
+ __m128i step1[8], step2[8], tmp[4];
+
+ // pass 1
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+ tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+ tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+ tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
+
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ // pass 2
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+ // stage 1
+ step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ io[0] = _mm_add_epi16(step1[0], step2[7]);
+ io[1] = _mm_add_epi16(step1[1], step1[6]);
+ io[2] = _mm_add_epi16(step1[2], step1[5]);
+ io[3] = _mm_add_epi16(step1[3], step2[4]);
+ io[4] = _mm_sub_epi16(step1[3], step2[4]);
+ io[5] = _mm_sub_epi16(step1[2], step1[5]);
+ io[6] = _mm_sub_epi16(step1[1], step1[6]);
+ io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
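+
+// Editor's note: pass 1 above keeps two symbols per register (see the "a&b"
+// comments), so the dual_set_epi16() constants apply a different doubled
+// cosine to each 64-bit half; pass 2 moves to one symbol per register and
+// uses the _mm_set1_epi16() "d" constants with the same doubled-coefficient
+// _mm_mulhrs_epi16() rounding trick as partial_butterfly_ssse3().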
+
+void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..bcf1a6ef98
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,103 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
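+
+; Editor's note: TRANSFORM_COLS mirrors the scalar lifting steps of the
+; inverse Walsh-Hadamard transform (cf. vpx_iwht4x4_16_add_c), roughly:
+;   a1 += c1;  d1 -= b1;
+;   e1 = (a1 - d1) >> 1;
+;   b1 = e1 - b1;  c1 = e1 - c1;
+;   a1 -= b1;  d1 += c1;
+; the wide subtract above computes (a1 - d1) >> 1 at 32-bit precision to
+; avoid overflowing 16-bit lanes.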
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 8
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..a58fb65539
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((int8_t)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
+ }
+}
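+
+// Editor's note: the mask computation above is a vectorized form of the
+// scalar filter_mask() logic in vpx_dsp/loopfilter.c, which per pixel is
+// roughly (sketch, abs() from <stdlib.h>):
+#if 0
+static int8_t filter_mask_ref(uint8_t limit, uint8_t blimit, uint8_t p3,
+                              uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+                              uint8_t q1, uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+#endif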
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
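+
+// Editor's note: with _mm256_shuffle_epi8(), an index of 128 has its high
+// bit set and therefore selects zero, so this table zero-extends 16 packed
+// uint8 pixels to 16-bit lanes within each 128-bit half of a 256-bit
+// register.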
+
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
+ p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
+ p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+ p256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+ p256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+ p256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+ p256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+ q256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+ q256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+ q256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+ q256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+ q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
+
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
+ flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
+ q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
+ q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
+ q256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(
+ four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)),
+ 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)),
+ 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)),
+ 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)),
+ 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)),
+ 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)),
+ 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
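+    // Blend stage: where flat is set, the 7-tap results replace p2..q2
+    // (elsewhere the 4-tap results / original pixels are kept); where flat2
+    // is also set, the 15-tap results replace p6..q6. Each row is written
+    // back as it is resolved.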
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
new file mode 100644
index 0000000000..6ea34cdd16
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
@@ -0,0 +1,1779 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
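+// Per-byte |a - b| for unsigned data: SSE2 has no packed byte absolute
+// difference that keeps per-lane results, so take both saturating
+// differences and OR them together.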
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK \
+ do { \
+    /* abs(q1 - q0), abs(p1 - p0) */                                         \
+ __m128i flat = abs_diff(q1p1, q0p0); \
+ /* abs(p1 - q1), abs(p0 - q0) */ \
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+ __m128i abs_p0q0, abs_p1q1, work; \
+ \
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+ hev = \
+ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+ hev = _mm_cmpgt_epi16(hev, thresh_v); \
+ hev = _mm_packs_epi16(hev, hev); \
+ \
+ /* const int8_t mask = filter_mask(*limit, *blimit, */ \
+ /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
+ abs_p0q0 = \
+ _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
+ abs_p1q1 = \
+ _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+ /* abs(p3 - p2), abs(p2 - p1) */ \
+ work = abs_diff(p3p2, p2p1); \
+ flat = _mm_max_epu8(work, flat); \
+ /* abs(q3 - q2), abs(q2 - q1) */ \
+ work = abs_diff(q3q2, q2q1); \
+ flat = _mm_max_epu8(work, flat); \
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+ mask = _mm_unpacklo_epi64(mask, flat); \
+ mask = _mm_subs_epu8(mask, limit_v); \
+ mask = _mm_cmpeq_epi8(mask, zero); \
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+ } while (0)
+
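+// FILTER4 applies the narrow 4-tap filter to the (p1, p0) / (q1, q0) pairs
+// prepared by the caller. Pixels are biased by 0x80 so signed saturating
+// arithmetic can be used; the results are un-biased again and left in
+// ps1ps0 and qs1qs0.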
+#define FILTER4 \
+ do { \
+ const __m128i t3t4 = \
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \
+ __m128i filter, filter2filter1, work; \
+ \
+ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
+ qs1qs0 = _mm_xor_si128(q1q0, t80); \
+ \
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
+ work = _mm_subs_epi8(ps1ps0, qs1qs0); \
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
+ filter = _mm_and_si128(filter, mask); /* & mask */ \
+ filter = _mm_unpacklo_epi64(filter, filter); \
+ \
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
+ \
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
+ filter = _mm_unpacklo_epi8(filter, filter); \
+ filter = _mm_srai_epi16(filter, 9); /* round */ \
+ filter = _mm_packs_epi16(filter, filter); \
+ filter = _mm_andnot_si128(hev, filter); \
+ \
+ hev = _mm_unpackhi_epi64(filter2filter1, filter); \
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
+ \
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
+ qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
+ ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
+ qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
+ ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
+ } while (0)
+
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+
+ p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+ q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+ FILTER_HEV_MASK;
+ FILTER4;
+
+ _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i x0, x1, x2, x3;
+ __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+
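+  // Load eight rows of eight pixels straddling the vertical edge and
+  // transpose them so the horizontal 4-tap filter path can be reused; the
+  // results are transposed back before the narrow stores at the end.
+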
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
+
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
+
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
+
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
+
+ // Transpose 8x8
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x0 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+ p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
+ p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x2 = _mm_unpackhi_epi16(x2, x3);
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+ q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+ FILTER_HEV_MASK;
+ FILTER4;
+
+ // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+ // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
+ x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+ // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+ storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+
+ storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+}
+
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
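+  // Each qXpX register holds eight pX pixels in its low half and eight qX
+  // pixels in its high half, so both sides of the edge are filtered with a
+  // single 128-bit operation per step.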
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((int8_t)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+    // Apply filter2 to ps0 (low half) and -filter1 to qs0 (high half)
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
+ flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
+ work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
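+      // Widen the bytes to 16 bits and build running sums for the 7-tap
+      // (flat) and 15-tap (wide flat) filters; each later tap reuses the
+      // previous sum with the samples entering and leaving the window
+      // added and subtracted.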
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
+ }
+}
+
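+// Slide a running 16-bit filter sum: add the two samples entering the
+// window (*a1, *a2) and subtract the two leaving it (*s1, *s2).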
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
+}
+
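+// Finish one 7-tap output: shift the pre-rounded sums down by 3, pack to
+// bytes, and keep the filtered value only where flat is set (other_filt
+// elsewhere). filter16_mask below is the 15-tap analogue (>> 4, keyed on
+// flat2).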
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+ // loopfilter done
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
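+      // Seed the 7-tap running sum with the op2 output:
+      // 3*p3 + 2*p2 + p1 + p0 + q0 + 4; later taps reuse it via
+      // filter_add2_sub2.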
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+ const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+ const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+ const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+ __m128i f_lo;
+ __m128i f_hi;
+
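+      // Seed the running sum with the p6 output of the 15-tap filter:
+      // 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8; later taps are
+      // derived from it with filter_add2_sub2.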
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
+ f_lo =
+ _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
+ f_hi =
+ _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+ p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+ q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ }
+}
+
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ // filter_mask and hev_mask
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
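+  // Compute the 7-tap (filter8) outputs for p2..q2 into the aligned flat_o*
+  // scratch buffers; they are merged in under the flat mask in the lp filter
+  // section below.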
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+ }
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 11);
+ filter1 = _mm_packs_epi16(filter1, filter1);
+
+ // Filter2 >> 3
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 11);
+ filter2 = _mm_packs_epi16(filter2, zero);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ filt = _mm_unpacklo_epi8(zero, filt);
+ filt = _mm_srai_epi16(filt, 9);
+ filt = _mm_packs_epi16(filt, zero);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
+ }
+}
+
+void vpx_lpf_horizontal_8_dual_sse2(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
+
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ int i = 0;
+
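+ // Each pass handles one 8-pixel half of the 16-wide segment; the flat
+ // (wide) filter is computed in 16-bit precision.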
+ do {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ src += 8;
+ } while (++i < 2);
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
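+ // All 16 bytes are in use here, so instead of widening (as in the 8-wide
+ // version) the signed >> 3 is emulated with a 16-bit logical shift: keep
+ // the low 5 bits of each byte (t1f) and restore the sign bits (te0) for
+ // negative lanes.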
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
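+ // Same idea for >> 1: keep the low 7 bits (t7f) and restore the sign bit
+ // (t80) where filt is negative.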
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
+ }
+}
+
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit0,
+ const unsigned char *limit0,
+ const unsigned char *thresh0,
+ const unsigned char *blimit1,
+ const unsigned char *limit1,
+ const unsigned char *thresh1) {
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, hev, flat;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+
+ // filter_mask and hev_mask
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+ }
+}
+
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+ // 2-way interleave w/hoisting of unpacks
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
+
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
+
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x14 = _mm_unpacklo_epi32(x12, x13); // 15
+ x15 = _mm_unpackhi_epi32(x12, x13); // 16
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
+ mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
+ mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
+ mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
+ mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
+ unsigned char *src[1];
+ unsigned char *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ transpose(src, pitch, dst, 8, 2);
+
+ // Loop filtering
+ vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ transpose(src, 8, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh);
+
+ // Transpose back
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..031f361a41
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_
+#define VPX_VPX_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "./vpx_config.h"
+
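+// These helpers use memcpy() so the compiler can emit unaligned 32-bit
+// accesses without violating strict aliasing.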
+static INLINE void storeu_int32(void *dst, int32_t v) {
+ memcpy(dst, &v, sizeof(v));
+}
+
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE __m128i load_unaligned_u32(const void *a) {
+ int val;
+ memcpy(&val, a, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
+ const int val = _mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+}
+
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
+static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+ d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+ d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+ d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+ d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+ d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+ d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+ loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+ _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+ *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+ *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+ *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ __m128i ss[4];
+
+ ss[0] = s;
+ ss[1] = _mm_srli_si128(s, 4);
+ ss[2] = _mm_srli_si128(s, 8);
+ ss[3] = _mm_srli_si128(s, 12);
+ store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+ uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+ _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+ _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+ _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+ _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+ _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
new file mode 100644
index 0000000000..119fa7cd1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+ int cols, int flimit) {
+ int col;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i f = _mm_set1_epi32(flimit);
+ DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
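+ // above_context is a ring buffer of the last 8 unfiltered rows; each
+ // iteration it supplies the row leaving the 15-row sliding window.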
+
+ // 8 columns are processed at a time.
+ // If rows is less than 8 the bottom border extension fails.
+ assert(cols % 8 == 0);
+ assert(rows >= 8);
+
+ for (col = 0; col < cols; col += 8) {
+ int row, i;
+ __m128i s = _mm_loadl_epi64((__m128i *)dst);
+ __m128i sum, sumsq_0, sumsq_1;
+ __m128i tmp_0, tmp_1;
+ __m128i below_context = _mm_setzero_si128();
+
+ s = _mm_unpacklo_epi8(s, zero);
+
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)above_context + i, s);
+ }
+
+ // sum *= 9
+ sum = _mm_slli_epi16(s, 3);
+ sum = _mm_add_epi16(s, sum);
+
+ // sum^2 * 9 == (sum * 9) * sum
+ tmp_0 = _mm_mullo_epi16(sum, s);
+ tmp_1 = _mm_mulhi_epi16(sum, s);
+
+ sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+ sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+ // Prime sum/sumsq
+ for (i = 1; i <= 6; ++i) {
+ __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+ a = _mm_unpacklo_epi8(a, zero);
+ sum = _mm_add_epi16(sum, a);
+ a = _mm_mullo_epi16(a, a);
+ sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+ sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+ }
+
+ for (row = 0; row < rows + 8; row++) {
+ const __m128i above =
+ _mm_load_si128((__m128i *)above_context + (row & 7));
+ __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+ __m128i above_sq, below_sq;
+ __m128i mask_0, mask_1;
+ __m128i multmp_0, multmp_1;
+ __m128i rv;
+ __m128i out;
+
+ this_row = _mm_unpacklo_epi8(this_row, zero);
+
+ if (row + 7 < rows) {
+ // Instead of copying the end context we just stop loading when we get
+ // to the last one.
+ below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+ below_context = _mm_unpacklo_epi8(below_context, zero);
+ }
+
+ sum = _mm_sub_epi16(sum, above);
+ sum = _mm_add_epi16(sum, below_context);
+
+ // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+ // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+ // because x86 does not have unpack with sign extension.
+ above_sq = _mm_mullo_epi16(above, above);
+ sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+ sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+ below_sq = _mm_mullo_epi16(below_context, below_context);
+ sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+ sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+ // sumsq * 16 - sumsq == sumsq * 15
+ mask_0 = _mm_slli_epi32(sumsq_0, 4);
+ mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+ mask_1 = _mm_slli_epi32(sumsq_1, 4);
+ mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+ multmp_0 = _mm_mullo_epi16(sum, sum);
+ multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+ mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+ mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+ // mask - f gives a negative value when mask < f
+ mask_0 = _mm_sub_epi32(mask_0, f);
+ mask_1 = _mm_sub_epi32(mask_1, f);
+
+ // Shift the sign bit down to create a mask
+ mask_0 = _mm_srai_epi32(mask_0, 31);
+ mask_1 = _mm_srai_epi32(mask_1, 31);
+
+ mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+ rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+ mask_1 = _mm_add_epi16(rv, sum);
+ mask_1 = _mm_add_epi16(mask_1, this_row);
+ mask_1 = _mm_srai_epi16(mask_1, 4);
+
+ mask_1 = _mm_and_si128(mask_0, mask_1);
+ mask_0 = _mm_andnot_si128(mask_0, this_row);
+ out = _mm_or_si128(mask_1, mask_0);
+
+ _mm_storel_epi64((__m128i *)(dst + row * pitch),
+ _mm_packus_epi16(out, zero));
+
+ _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
+ }
+
+ dst += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
new file mode 100644
index 0000000000..6837a5cf28
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+ *eob_ptr = 0;
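+ // Written up front so the n_coeffs == 16 early return below leaves a valid
+ // eob.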
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
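+ // If no coefficient reaches the zbin threshold, the whole group of 16
+ // quantizes to zero; store zeros and skip the quantization math.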
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (n_coeffs == 16) return;
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+ const int16_t *iscan = scan_order->iscan;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..3d97b3fdae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift, int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
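+ // 0x54 selects 64-bit lanes {0,1,1,1}: the low 128 bits keep {DC, AC...}
+ // for the first 16 coefficients while the high 128 bits hold only AC values
+ // (entries 1..7 of these tables are the same AC constant).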
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+#else
+ // typedef int16_t tran_low_t;
+ return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+ // typedef int16_t tran_low_t;
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_setzero_si256();
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+ const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+ const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+ return v_nz_mask;
+ }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
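+ // load_coefficients_avx2() packs 32-bit coeffs into {0-3, 8-11, 4-7, 12-15}
+ // lane order, so iscan is permuted (0xD8) into the same order before
+ // masking.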
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+ // Do DC and first 15 AC.
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+ __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+ __m256i *v_quant_shift, __m256i *v_eobmax) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+ return *v_eobmax;
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >> 15);
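+ // (mulhi << 1) | (mullo >> 15) reassembles bits 30..15 of the 32-bit
+ // product, i.e. the product >> 15 for the value ranges used here.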
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32_hi =
+ _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+ const __m256i v_tmp32_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_sign_lo =
+ _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i v_sign_hi =
+ _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+ const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+ const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
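+ // For 32x32 blocks dqcoeff is abs(qcoeff) * dequant / 2, computed in 32-bit
+ // lanes and re-signed from the original coefficients.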
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+ dqcoeff_ptr);
+#endif
+
+ return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+ }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ const int16_t *iscan = scan_order->iscan;
+
+ load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round,
+ mb_plane->quant, &v_quant, dequant_ptr, &v_dequant,
+ mb_plane->quant_shift, &v_quant_shift, 1);
+
+ // Do DC and first 15 AC.
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = (32 * 32) - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..9533e7916d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ // Poor man's abs().
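+ // abs(x) == (x ^ s) - s with s = x >> 15; SSE2 has no _mm_abs_epi16()
+ // (that arrived with SSSE3).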
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
new file mode 100644
index 0000000000..fe42fee018
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+static INLINE void load_b_values32x32(
+ const struct macroblock_plane *const mb_plane, __m128i *zbin,
+ __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+ __m128i *dequant, __m128i *shift) {
+ const __m128i one = _mm_set1_epi16(1);
+ // The 32x32 path halves zbin and round.
+ *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+ // Shift with rounding.
+ *zbin = _mm_add_epi16(*zbin, one);
+ *zbin = _mm_srli_epi16(*zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ *zbin = _mm_sub_epi16(*zbin, one);
+
+ *round = _mm_load_si128((const __m128i *)mb_plane->round);
+ *round = _mm_add_epi16(*round, one);
+ *round = _mm_srli_epi16(*round, 1);
+
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+ // This is likely not technically OK: quant_shift can be up to 1 << 16,
+ // and shifting it left again can overflow 16 bits, but the tests are not
+ // comprehensive enough to catch that and it has been this way forever.
+ *shift = _mm_slli_epi16(*shift, 1);
+}
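+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// add-then-shift pairs above are rounding division by two, matching the C
+// code's ROUND_POWER_OF_TWO(x, 1):
+static INLINE int16_t scalar_round_half(int16_t x) {
+  return (int16_t)((x + 1) >> 1);
+}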
+
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr,
+ __m128i *dequant) {
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
+// With SSSE3 and later, abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
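+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// scalar identity behind invert_sign_sse2(): with sign = a >> 15 (0 for
+// non-negative values, -1 for negative ones), (a ^ sign) - sign computes
+// abs(a), and applying the same transform with the original sign restores it.
+static INLINE int16_t scalar_invert_sign(int16_t a, int16_t sign) {
+  return (int16_t)((a ^ sign) - sign);
+}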
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
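+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// scalar arithmetic calculate_qcoeff() vectorizes: the saturating add mirrors
+// _mm_adds_epi16 and each >> 16 mirrors _mm_mulhi_epi16.
+static INLINE int16_t scalar_calculate_qcoeff(int16_t abs_coeff, int16_t round,
+                                              int16_t quant, int16_t shift) {
+  int tmp = abs_coeff + round;
+  if (tmp > INT16_MAX) tmp = INT16_MAX;  // saturate like _mm_adds_epi16
+  return (int16_t)(((((tmp * quant) >> 16) + tmp) * shift) >> 16);
+}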
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ tran_low_t *dqcoeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+ const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+ const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Scan 16 coefficients for the end of block (eob): mask each lane's scan
+// position by whether the coefficient is nonzero and return the lanewise max.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const int16_t *scan, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
+ __m128i eob0, eob1;
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
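+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// shuffle/max ladder above is simply a horizontal maximum over the eight
+// 16-bit lanes:
+static INLINE int16_t scalar_accumulate_eob(const int16_t lanes[8]) {
+  int16_t eob = lanes[0];
+  int i;
+  for (i = 1; i < 8; ++i) {
+    if (lanes[i] > eob) eob = lanes[i];
+  }
+  return eob;
+}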
+
+#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..641f23298b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const __m128i zero = _mm_setzero_si128();
+ int index;
+ const int16_t *iscan = scan_order->iscan;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
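+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// _mm_movemask_epi8(all_zero) == 0 early-outs above are the vector form of
+// this scalar test (zbin already has 1 subtracted, so > stands in for the
+// original >=; in the first group the DC lane compares against the DC zbin):
+static INLINE int scalar_group_below_zbin(const int16_t *abs_coeff,
+                                          int16_t zbin, int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    if (abs_coeff[i] > zbin) return 0;  // at least one coeff survives
+  }
+  return 1;  // the whole group quantizes to zero; skip the arithmetic
+}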
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
new file mode 100644
index 0000000000..e8d2a05771
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
+ const __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff) {
+ // Take the absolute value so the shift below rounds toward zero, as in C.
+ const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+ const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ // "Divide" by 2.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1);
+
+ dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+ _mm_store_si128((__m128i *)(dqcoeff),
+ _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
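+
+// Editor's note: illustrative sketch, not part of the upstream change. Per
+// coefficient, the function above computes
+//   dqcoeff = sign(qcoeff) * ((|qcoeff| * dequant) >> 1)
+// with the shift truncating toward zero, matching the C code's division by 2:
+static INLINE tran_low_t scalar_dqcoeff_32x32(int16_t qcoeff,
+                                              int16_t dequant) {
+  const int abs_q = qcoeff < 0 ? -qcoeff : qcoeff;
+  const int dq = (abs_q * dequant) >> 1;
+  return (tran_low_t)(qcoeff < 0 ? -dq : dq);
+}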
+
+#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..cf7111983b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// Note: declaring the parameter as sums[4] makes some versions of Visual
+// Studio fail, even though the two forms should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't
+// be aligned
+static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
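+
+// Editor's note: illustrative sketch, not part of the upstream change.
+// _mm256_sad_epu8 leaves one partial SAD per 64-bit lane, so the hadd ladder
+// above amounts to summing four partials per candidate reference:
+static INLINE void scalar_calc_final_4(const uint64_t lanes[4][4],
+                                       uint32_t sad_array[4]) {
+  int k;
+  for (k = 0; k < 4; ++k) {
+    sad_array[k] =
+        (uint32_t)(lanes[k][0] + lanes[k][1] + lanes[k][2] + lanes[k][3]);
+  }
+}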
+
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
+ int i;
+ const uint8_t *refs[4];
+ __m256i sums[4];
+
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
+ sums[0] = _mm256_setzero_si256();
+ sums[1] = _mm256_setzero_si256();
+ sums[2] = _mm256_setzero_si256();
+ sums[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+ // sum of the absolute differences between each ref[] and src
+ r[0] = _mm256_sad_epu8(r[0], s);
+ r[1] = _mm256_sad_epu8(r[1], s);
+ r[2] = _mm256_sad_epu8(r[2], s);
+ r[3] = _mm256_sad_epu8(r[3], s);
+
+ // sum every ref[]
+ sums[0] = _mm256_add_epi32(sums[0], r[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r[3]);
+
+ src_ptr += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+
+ calc_final_4(sums, sad_array);
+}
+
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
+ __m256i sums[4];
+ int i;
+ const uint8_t *refs[4];
+
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
+ sums[0] = _mm256_setzero_si256();
+ sums[1] = _mm256_setzero_si256();
+ sums[2] = _mm256_setzero_si256();
+ sums[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ __m256i r_lo[4], r_hi[4];
+ // load 64 bytes from src and all ref[]
+ const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+ const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
+ r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
+ r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
+ r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
+ r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
+
+ // sum of the absolute differences between each ref[] and src
+ r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
+ r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
+ r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
+ r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
+ r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
+ r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
+ r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
+ r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
+
+ // sum every ref[]
+ sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
+ sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
+
+ src_ptr += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+
+ calc_final_4(sums, sad_array);
+}
+
+#define SAD64_H(h) \
+ void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+#define SAD32_H(h) \
+ void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h) \
+ void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define SADS32_H(h) \
+ void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
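+
+// Editor's note: illustrative sketch, not part of the upstream change. The
+// vpx_sad_skip_* variants above approximate a full SAD by measuring only the
+// even rows (doubled strides, half the height) and doubling the result. For
+// a single reference the scalar equivalent is:
+static INLINE uint32_t scalar_sad_skip(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       int w, int h) {
+  uint32_t sad = 0;
+  int r, c;
+  for (r = 0; r < h; r += 2) {  // even rows only
+    for (c = 0; c < w; ++c) {
+      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
+      sad += (uint32_t)(d < 0 ? -d : d);
+    }
+  }
+  return 2 * sad;  // scale back up to full height
+}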
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
new file mode 100644
index 0000000000..cfd23fedd9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX512
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
+ __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m512i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref_array[0];
+ ref1 = ref_array[1];
+ ref2 = ref_array[2];
+ ref3 = ref_array[3];
+ sum_ref0 = _mm512_setzero_si512();
+ sum_ref1 = _mm512_setzero_si512();
+ sum_ref2 = _mm512_setzero_si512();
+ sum_ref3 = _mm512_setzero_si512();
+ for (i = 0; i < 64; i++) {
+ // load src and all ref[]
+ src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
+ ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
+ ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
+ ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
+ ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
+ // sum of the absolute differences between each ref[] and src
+ ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
+ // sum every ref[]
+ sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
+
+ src_ptr += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m256i sum256;
+ __m128i sum128;
+ // In each 64-bit lane of sum_ref[], the result sits in the low 4 bytes;
+ // the high 4 bytes are zero.
+ // Shift sum_ref1 and sum_ref3 left by 4 bytes so the four results can be
+ // merged.
+ sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
+ sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4);
+
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
+ sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref[]
+ sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow),
+ _mm512_extracti32x8_epi32(sum_mlow, 1));
+ sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
+ _mm256_extractf128_si256(sum256, 1));
+
+ _mm_storeu_si128((__m128i *)(sad_array), sum128);
+ }
+}
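+
+// Editor's note: illustrative sketch, not part of the upstream change. After
+// the loop each sum_ref holds eight 64-bit partial SADs, so the lane merging
+// above reduces to:
+static INLINE void scalar_reduce_4x8(const uint64_t lanes[4][8],
+                                     uint32_t sad_array[4]) {
+  int k, l;
+  for (k = 0; k < 4; ++k) {
+    uint64_t total = 0;
+    for (l = 0; l < 8; ++l) total += lanes[k][l];
+    sad_array[k] = (uint32_t)total;
+  }
+}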
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..ed4ea3ef9b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,278 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_4x2x4 5-6 0
+ movd m0, [srcq +%2]
+%if %1 == 1
+ movd m6, [ref1q+%3]
+ movd m4, [ref2q+%3]
+ movd m7, [ref3q+%3]
+ movd m5, [ref4q+%3]
+ movd m1, [srcq +%4]
+ movd m2, [ref1q+%5]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+%5]
+ movd m2, [ref3q+%5]
+ movd m3, [ref4q+%5]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q+%3]
+ movd m5, [ref1q+%5]
+ movd m2, [ref2q+%3]
+ movd m4, [ref2q+%5]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q+%3]
+ movd m5, [ref3q+%5]
+ punpckldq m3, m5
+ movd m4, [ref4q+%3]
+ movd m5, [ref4q+%5]
+ punpckldq m4, m5
+ movd m5, [srcq +%4]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+ movh m0, [srcq +%2]
+%if %1 == 1
+ movh m4, [ref1q+%3]
+ movh m5, [ref2q+%3]
+ movh m6, [ref3q+%3]
+ movh m7, [ref4q+%3]
+ movhps m0, [srcq +%4]
+ movhps m4, [ref1q+%5]
+ movhps m5, [ref2q+%5]
+ movhps m6, [ref3q+%5]
+ movhps m7, [ref4q+%5]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q+%3]
+ movh m2, [ref2q+%3]
+ movh m3, [ref3q+%3]
+ movhps m0, [srcq +%4]
+ movhps m1, [ref1q+%5]
+ movhps m2, [ref2q+%5]
+ movhps m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movh m1, [ref4q+%3]
+ movhps m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_16x2x4 5-6 0
+ ; 1st 16 px
+ mova m0, [srcq +%2]
+%if %1 == 1
+ movu m4, [ref1q+%3]
+ movu m5, [ref2q+%3]
+ movu m6, [ref3q+%3]
+ movu m7, [ref4q+%3]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movu m1, [ref1q+%3]
+ movu m2, [ref2q+%3]
+ movu m3, [ref3q+%3]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%3]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+
+ ; 2nd 16 px
+ mova m0, [srcq +%4]
+ movu m1, [ref1q+%5]
+ movu m2, [ref2q+%5]
+ movu m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+ psadbw m1, m0
+ paddd m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+ PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+ PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+ PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+ PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
+; 8x8, 8x4, 4x8 and 4x4
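+;
+; Editor's note: illustrative sketch, not part of the upstream change. For a
+; WxH block the scalar computation is, for each of the four references:
+;   for (k = 0; k < 4; ++k) {
+;     res[k] = 0;
+;     for (r = 0; r < H; ++r)
+;       for (c = 0; c < W; ++c)
+;         res[k] += abs(src[r * src_stride + c] - ref[k][r * ref_stride + c]);
+;   }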
+%macro SADNXN4D 2-3 0
+%if %3 == 1 ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; normal sad
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%endif
+%if %3 == 1
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+ PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 1 ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
+%if %1 > 4
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ movifnidn r4, r4mp
+ paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
+ movu [r4], m4
+ RET
+%else
+ movifnidn r4, r4mp
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
+ movq [r4+0], m6
+ movq [r4+8], m7
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..e00494d766
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ const int ref2_stride = ref_stride << 1;
+ const int src2_stride = src_stride << 1;
+ const int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+#define FSAD64_H(h) \
+ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int vpx_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD32_H(h) \
+ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int vpx_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
+
+FSAD64
+FSAD32
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
+
+#define FSADAVG64_H(h) \
+ unsigned int vpx_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int vpx_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
+
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
+
+FSADAVG64
+FSADAVG32
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..627e463bf8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -0,0 +1,332 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+%macro SAD_FN 4
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if VPX_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; skip rows, so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 64, 2 ; sad64x64_skip_sse2
+SAD64XN 32, 2 ; sad64x32_skip_sse2
+
+; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+
+; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN 8, 2 ; sad16x8_skip_sse2
+
+; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN 8, 2 ; sad8x8_skip_sse2
+
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 8, 2 ; sad4x8_skip_sse2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..41ffbb07e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,219 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1, widening dwords to qwords
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1, starting from words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void vpx_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
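+;
+; Editor's note: illustrative sketch, not part of the upstream change. The
+; scalar accumulation this routine vectorizes is:
+;   for (i = 0; i < 16; ++i, s += sp, r += rp)
+;     for (j = 0; j < 16; ++j) {
+;       *sum_s    += s[j];
+;       *sum_r    += r[j];
+;       *sum_sq_s += s[j] * s[j];
+;       *sum_sq_r += r[j] * r[j];
+;       *sum_sxr  += s[j] * r[j];
+;     }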
+globalsym(vpx_ssim_parms_16x16_sse2)
+sym(vpx_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in mode selection code.
+globalsym(vpx_ssim_parms_8x8_sse2)
+sym(vpx_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..d1d8d3460e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1467 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *ref, ptrdiff_t ref_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
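+
+; Editor's note: illustrative sketch, not part of the upstream change. Per
+; 16-bit lane, SUM_SSE accumulates
+;   diff = src - ref;
+;   sum += diff;         // word partials in %5
+;   sse += diff * diff;  // pmaddwd pairwise-adds the squares into dwords in %6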
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+ ; if H=64 and W=16, each of the 8 words in m6 is 2(1 bit)x64(6 bit)x9 bit =
+ ; 16 bits, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if VPX_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define second_str second_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store the bilin_filter and pw_8 locations on the stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, second_pred, second_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl second_str, 1
+%endif
+%endif
+
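+ ; Dispatch overview: x_offset and y_offset are 1/8-pel phases in 0..7;
+ ; 0 needs no filtering on that axis, 4 is the half-pel case (a plain
+ ; pavgb with the next pixel/row), and any other value indexes the
+ ; bilinear filter tables above.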
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [refq]
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [second_predq]
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [refq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [refq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [refq+ref_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [second_predq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [refq]
+ pavgb m0, m2
+ movx m3, [refq+ref_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [refq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
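+ ; (The two forms agree exactly: (num-x)*in1 + x*in2 = (in1<<log2(num)) +
+ ; x*(in2-in1), and the in1<<log2(num) term shifts back out losslessly.)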
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ movx m1, [refq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [refq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [refq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [refq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [refq+ref_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [refq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [refq]
+ pavgb m0, m2
+ movx m3, [refq+ref_strideq]
+%if %1 > 4
+ pavgb m0, [second_predq]
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [refq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ movx m1, [refq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [refq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [refq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [refq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [refq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [refq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [refq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; are one register short of being able to keep the bilin-filtered second
+ ; line cached as words for the next iteration. Packing into bytes costs
+ ; 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [second_predq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [refq]
+ paddw m4, m3
+ movx m3, [refq+ref_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; repurposed; src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [refq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [refq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [refq+ref_strideq]
+ movx m1, [refq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [refq+ref_strideq]
+ paddw m2, m1
+ movx m1, [refq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are
+; identical between the ssse3 and non-ssse3 versions. It may make sense to
+; merge their code in the sense that the ssse3 version would jump to the
+; appropriate location in the sse2 version, rather than duplicating that
+; code in the binary.
+
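+; Each SUBPEL_VARIANCE W[, 1] line below expands to one complete function
+; per INIT_XMM target; with x86inc's prefix/suffix handling the symbols
+; come out as e.g. vpx_sub_pixel_variance16xh_ssse3 and, with the second
+; argument set, vpx_sub_pixel_avg_variance8xh_sse2.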
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..4849581ed4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
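+
+// Note: pixels are widened from u8 to s16 (cvtepu8) before subtracting,
+// since src - pred can be negative and must not wrap in unsigned 8-bit
+// arithmetic; the diff buffer is int16_t throughout.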
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..4273efb854
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vpx_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
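+;
+; Scalar reference for the contract (an editor's sketch, not assembled):
+; for (r = 0; r < rows; r++)
+; for (c = 0; c < cols; c++)
+; diff[r * diff_stride + c] = (int16_t)src[r * src_stride + c] -
+; pred[r * pred_stride + c];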
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000000..df6514b2c4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
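+ // Scalar model of the result (an editor's sketch, not compiled in):
+ // uint64_t ss = 0;
+ // for (r = 0; r < size; r++)
+ // for (c = 0; c < size; c++)
+ // ss += (int64_t)src[r * stride + c] * src[r * stride + c];
+ // return ss;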
+ // Over 75% of all calls are with size == 4.
+ if (size == 4) {
+ __m128i s[2], sq[2], ss;
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ s[0] = loadh_epi64(s[0], src + 1 * stride);
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ s[1] = loadh_epi64(s[1], src + 3 * stride);
+ sq[0] = _mm_madd_epi16(s[0], s[0]);
+ sq[1] = _mm_madd_epi16(s[1], s[1]);
+ sq[0] = _mm_add_epi32(sq[0], sq[1]);
+ ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
+ ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));
+
+ return (uint64_t)_mm_cvtsi128_si32(ss);
+ } else {
+ // Generic case
+ int r = size;
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ assert(size % 8 == 0);
+
+ do {
+ int c = 0;
+ __m128i v_acc_d = _mm_setzero_si128();
+
+ do {
+ const int16_t *const b = src + c;
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ c += 8;
+ } while (c < size);
+
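+ // Fold the 32-bit block totals into the 64-bit accumulator: the mask
+ // keeps the low dword of each qword lane and the shift brings down the
+ // high dword, so both halves are added zero-extended.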
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 8 * stride;
+ r -= 8;
+ } while (r);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if VPX_ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
+#endif
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..b4f1190d74
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 02 12 22 32 03 13 23 33
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_unpackhi_epi32(a0, a1);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose a 16x16 block held as two 8-column halves (left = columns 0-7,
+// right = columns 8-15, 16 rows each) in place: transpose each 8x8 quadrant
+// and swap the two off-diagonal quadrants through tbuf.
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..de5ce43b00
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "vpx/vpx_integer.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
+#define dual_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..8305b9f20f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+
+/* clang-format off */
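+// Row o (o = 0..7, in eighth-pel units) holds the 2-tap bilinear pair
+// (16 - 2 * o, 2 * o) repeated across 32 bytes; rows are indexed by (o << 5).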
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
+DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = {
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1
+};
+/* clang-format on */
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2);
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+ // subtract adjacent elements using src*1 + ref*-1
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse,
+ __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ // extract the low lane and add it to the high lane
+ const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse),
+ _mm256_extractf128_si256(vsse, 1));
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
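+  // res holds the total sse in element 0 and the total sum in element 1.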
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ *((int *)sum) = _mm_extract_epi32(res, 1);
+}
+
+static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse,
+ __m256i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ // extract the low lane and add it to the high lane
+ const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ const __m128i sum_reg_64 =
+ _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
+
+ variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum);
+}
+
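+// Widen the 16-bit partial sums to 32 bits so that taller blocks cannot
+// overflow the accumulator.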
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+ const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+ const __m256i sum_hi =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+ return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance8_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ __m128i src0, src1, ref0, ref1;
+ __m256i ss, rr, diff;
+
+ // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+ src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+ // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+ src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+ src0 = _mm_unpacklo_epi64(src0, src1);
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+ ss = _mm256_cvtepu8_epi16(src0);
+
+ // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+ ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+  // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10
+ ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+ ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+ rr = _mm256_cvtepu8_epi16(ref0);
+
+ diff = _mm256_sub_epi16(ss, rr);
+ *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+ *sum = _mm256_add_epi16(*sum, diff);
+}
+
+static INLINE void variance16_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
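+  // Pack two rows of 16 pixels into one 256-bit register, one row per lane.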
+ const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+ const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+ const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+ const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i += 2) {
+ variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i += 2) {
+ variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ variance32_kernel_avx2(src, ref, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
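+  // Note: *vsse is not zeroed here; callers accumulate it across calls.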
+
+ for (i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum);
+}
+
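+// Apply the 2-tap bilinear filter to interleaved pixel pairs:
+// result = (p0 * f0 + p1 * f1 + 8) >> 4, kept in 16 bits.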
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                       \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// Final horizontal reduction of sum and sse. The 16-bit sum is sign-extended
+// to 32 bits by interleaving it with its sign mask (res_cmp) first.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
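+// x_offset == 0 and y_offset == 0: no subpel filtering; just accumulate the
+// (optionally second-pred-averaged) 32-wide differences.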
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0). sstep selects the second sample:
+// src_stride for the vertical half-pel case, 1 for the horizontal one.
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int do_sec, int height,
+ __m256i *sum_reg, __m256i *sse_reg,
+ int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, src_stride);
+}
+
+static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, 1);
+}
+
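+// x_offset == 4 and y_offset == 4: half-pel in both directions. Each row's
+// horizontal average is reused as the previous row for the vertical average.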
+static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg);
+    // save the current source average for the next row
+    prev_src_avg = src_avg;
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+// (x == 0, y == bil) or (x == bil, y == 0). sstep selects the second sample:
+// src_stride for the vertical case, 1 for the horizontal one.
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int do_sec, int height,
+ __m256i *sum_reg, __m256i *sse_reg,
+ int offset, int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (offset << 5)));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ second_pred += second_stride;
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int y_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, y_offset, src_stride);
+}
+
+static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, x_offset, 1);
+}
+
+static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg);
+ prev_src_avg = src_avg;
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i src_reg, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(filter)
+  // pack the filtered 16-bit values back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+    // average the previous packed row with the current one (vertical half-pel)
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset, int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i prev_src_pack, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(xfilter)
+  // pack the filtered 16-bit values back to 8 bits within each 128-bit lane
+ prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src += src_stride;
+
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(xfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+    // interleave the previous packed row with the current one for the y filter
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack);
+
+ FILTER_SRC(yfilter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ }
+
+ prev_src_pack = src_pack;
+
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, unsigned int *sse) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i sum_reg = _mm256_setzero_si256();
+ __m256i sse_reg = _mm256_setzero_si256();
+ __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ int sum;
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = 4 and y_offset = 0
+ } else if (x_offset == 4) {
+ if (y_offset == 0) {
+ spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = bilin interpolation
+ } else {
+ spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset);
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
+}
+
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ NULL, 0, 0, height, sse);
+}
+
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int height,
+ unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ second_pred, second_stride, 1, height, sse);
+}
+
+typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum);
+
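+// Each variance function below computes sse - sum^2 / (w * h); the division
+// is a right shift by log2(w * h).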
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ __m128i vsum_128;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
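+  // Add the two 128-bit lanes of the 16-bit sum, then widen to 32 bits.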
+ vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+ _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ __m128i vsum_128;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum);
+ vsum = sum_to_32bit_avx2(vsum);
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse = _mm256_setzero_si256();
+ __m256i vsum = _mm256_setzero_si256();
+ __m128i vsum_128;
+ int sum;
+ variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ vsum = sum_to_32bit_avx2(vsum);
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse = _mm256_setzero_si256();
+ __m256i vsum = _mm256_setzero_si256();
+ __m128i vsum_128;
+ int sum;
+  int i;
+
+ for (i = 0; i < 2; i++) {
+ __m256i vsum16;
+ variance64_avx2(src_ptr + 32 * i * src_stride, src_stride,
+ ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+ &vsum16);
+ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));
+ }
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse;
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse;
+}
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
+ unsigned int sse1;
+ const int se1 = sub_pixel_variance32xh_avx2(
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1);
+ unsigned int sse2;
+ const int se2 =
+ sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset,
+ ref_ptr + 32, ref_stride, 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
+ const int se = sub_pixel_variance32xh_avx2(
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred) {
+ unsigned int sse1;
+ const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+ y_offset, ref_ptr, ref_stride,
+ second_pred, 64, 64, &sse1);
+ unsigned int sse2;
+ const int se2 = sub_pixel_avg_variance32xh_avx2(
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride,
+ second_pred + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
+
+ *sse = sse1 + sse2;
+
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred) {
+ // Process 32 elements in parallel.
+ const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+ y_offset, ref_ptr, ref_stride,
+ second_pred, 32, 32, sse);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..d6eb12da1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
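+// Horizontal sum of the four 32-bit elements of val.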
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+ return (unsigned int)_mm_cvtsi128_si32(val);
+}
+
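+// Sum of squares over a 16x16 block of 16-bit source values (256 elements).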
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src_ptr += 8;
+ }
+
+ return add32x4_sse2(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
+ const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
+ return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src_ptr,
+ const __m128i ref_ptr,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr);
+ *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+ *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+}
+
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+ *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
+
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
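+  // Duplicate each 16-bit sum into a 32-bit slot, then arithmetic-shift
+  // right to sign-extend.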
+ vsum = _mm_unpacklo_epi16(vsum, vsum);
+ vsum = _mm_srai_epi32(vsum, 16);
+ *sum = (int)add32x4_sse2(vsum);
+}
+
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+ const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+ const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+ return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE int sum_final_sse2(const __m128i sum) {
+ const __m128i t = sum_to_32bit_sse2(sum);
+ return (int)add32x4_sse2(t);
+}
+
+static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 256); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; i += 2) {
+ const __m128i s = load4x2_sse2(src_ptr, src_stride);
+ const __m128i r = load4x2_sse2(ref_ptr, ref_stride);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ int i;
+
+ assert(h <= 128); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; i++) {
+ const __m128i s =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
+ const __m128i r =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
+ const uint8_t *const ref_ptr,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+ variance_kernel_sse2(src0, ref0, sse, sum);
+ variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 64); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 32); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+ variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 16); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+ variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+ variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum);
+ variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m128i vsse, vsum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, sum);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m128i vsse, vsum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_256_pel_sse2(vsse, vsum, sse, sum);
+}
+
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 4);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum;
+ int sum;
+ variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum;
+ int sum;
+ variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ *sse = add32x4_sse2(vsse);
+ sum = sum_final_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+  int i;
+
+ for (i = 0; i < 2; i++) {
+ __m128i vsum16;
+ variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
+ ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+  int i;
+
+ for (i = 0; i < 2; i++) {
+ __m128i vsum16;
+ variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+ ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+  int i;
+
+ for (i = 0; i < 4; i++) {
+ __m128i vsum16;
+ variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+ ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \
+ int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
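+// Compose each w x h variance from one or more wf-wide columns computed by
+// the asm kernel, then apply variance = sse - sum^2 / (w * h).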
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \
+ unsigned int sse_tmp; \
+ int se = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \
+ &sse_tmp, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ } \
+ } \
+ *sse = sse_tmp; \
+ return sse_tmp - \
+ (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
+
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \
+ int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \
+ const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ unsigned int sse_tmp; \
+ int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \
+ second_pred, w, h, &sse_tmp, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \
+ ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \
+ ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \
+ ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ } \
+ } \
+ *sse = sse_tmp; \
+ return sse_tmp - \
+ (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
+
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
+
+#undef FNS
+#undef FN
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
new file mode 100644
index 0000000000..3f444e2e6a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -0,0 +1,226 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
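+; convolve_fn generates the copy and avg kernels; the avg variant rounds the
+; source against the destination with pavgb (pavgw for high bitdepth).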
+%macro convolve_fn 1-2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ f, fxo, fxs, fyo, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ f, fxo, fxs, fyo, fys, w, h
+%endif
+ mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl src_strideq, 1
+ shl dst_strideq, 1
+%else
+ cmp r4d, 4
+ je .w4
+%endif
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+%ifidn %2, highbd
+ cmp r4d, 64
+ je .w64
+
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop128
+ RET
+%endif
+
+.w64:
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+%ifnidn %2, highbd
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movd m0, [srcq]
+ movd m1, [srcq+src_strideq]
+ movd m2, [srcq+src_strideq*2]
+ movd m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movd m4, [dstq]
+ movd m5, [dstq+dst_strideq]
+ movd m6, [dstq+dst_strideq*2]
+ movd m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movd [dstq ], m0
+ movd [dstq+dst_strideq ], m1
+ movd [dstq+dst_strideq*2], m2
+ movd [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy
+convolve_fn avg
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..fc301fb39e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -0,0 +1,964 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+    punpcklwd   xmm0, xmm6                  ;two rows in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void vpx_highbd_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d4_v8_sse2)
+sym(vpx_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d8_v8_sse2)
+sym(vpx_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d16_v8_sse2)
+sym(vpx_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 1, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d4_h8_sse2)
+sym(vpx_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d8_h8_sse2)
+sym(vpx_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_highbd_filter_block1d16_h8_sse2)
+sym(vpx_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 1, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
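
For reference, the 8-tap high bit-depth macros above pair the taps (k0 with k6, k1 with k7, k2 with k5, k3 with k4) for pmaddwd, accumulate in 32 bits with the k3/k4 pair added last per the overflow note at the top of the file, round with krd (64), shift right by 7, and clamp to [0, (1 << bd) - 1]; the _avg entry points then average with the existing destination via pavgw. A minimal scalar sketch of one output pixel, with illustrative names that are not part of libvpx (step is the source pitch for the vertical kernels and 1 for the horizontal ones):

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t highbd_8tap_pixel(const uint16_t *s, ptrdiff_t step,
                                      const int16_t *k /* 8 taps */, int bd,
                                      int avg, uint16_t dst_old) {
      int32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += (int32_t)s[i * step] * k[i];
      sum = (sum + 64) >> 7;                          /* krd rounding, shift 7 */
      if (sum < 0) sum = 0;                           /* min clamp */
      if (sum > (1 << bd) - 1) sum = (1 << bd) - 1;   /* max clamp to bd */
      if (avg) sum = (sum + dst_old + 1) >> 1;        /* pavgw in _avg paths */
      return (uint16_t)sum;
    }
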
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..bd51c75bcb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,496 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+    punpcklwd   xmm0, xmm1                  ;two rows in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%if VPX_ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm8, rdx
+ movq xmm5, rcx
+ pshufd xmm8, xmm8, 0b
+ movdqa xmm1, xmm8
+ psllw xmm8, xmm5
+ psubw xmm8, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm9, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm9, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm9, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm9, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm9, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+ pminsw xmm2, xmm8
+ pmaxsw xmm2, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+%endif
+
+SECTION .text
+
+globalsym(vpx_highbd_filter_block1d4_v2_sse2)
+sym(vpx_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_sse2)
+sym(vpx_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v2_sse2)
+sym(vpx_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_h2_sse2)
+sym(vpx_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_sse2)
+sym(vpx_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h2_sse2)
+sym(vpx_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
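
The bilinear entry points above reduce to a 2-tap filter: only k[3] and k[4] are applied to two adjacent samples (the next pixel for the h2 kernels, the next row for the v2 kernels), with the same rounding, shift and bit-depth clamp as the 8-tap case, and the _avg variants again finish with pavgw against the current destination. A minimal scalar sketch (illustrative name, not libvpx API):

    #include <stdint.h>

    static uint16_t highbd_bilinear_px(uint16_t s0, uint16_t s1,
                                       const int16_t *k, int bd) {
      int32_t out = (int32_t)s0 * k[3] + (int32_t)s1 * k[4];
      out = (out + 64) >> 7;                        /* rounding by 64, shift 7 */
      if (out < 0) out = 0;
      if (out > (1 << bd) - 1) out = (1 << bd) - 1; /* clamp to bit depth */
      return (uint16_t)out;
    }
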
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
new file mode 100644
index 0000000000..21a35ae3c3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -0,0 +1,1161 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_ports/mem.h"
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
+static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first, dst_second;
+ __m128i even, odd;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+    // The two results are then added together for the first half of the even
+    // output.
+    // Repeat multiple times to get the whole output
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 6 4 2 0
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ dst_first = mm_zip_epi32_sse2(&even, &odd);
+
+ // Do again to get the second half of dst
+ src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 14 12 10 8
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 15 13 11 9
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the second half of the dst
+ dst_second = mm_zip_epi32_sse2(&even, &odd);
+
+ // Round each result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+ dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm_packus_epi16(dst_first, dst_second);
+ _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
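
The 8-bit 4-tap paths in this file pre-shift the kernel right by one (kernel_reg = _mm_srai_epi16(kernel_reg, 1)) so the 16-bit partial sums produced by the pmaddwd/pack helpers stay in range, then round with 32 and shift by 6, which matches (sum + 64) >> 7 on the unhalved taps to within one LSB. Assuming mm_round_epi16_sse2(v, 32, 6) computes (v + 32) >> 6, a scalar sketch of one output pixel follows (filter4_px is a hypothetical name, not libvpx API):

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t filter4_px(const uint8_t *s, ptrdiff_t step,
                              const int16_t *k /* 8 taps, only k[2..5] used */) {
      int32_t sum = 0;
      int i;
      for (i = 0; i < 4; ++i) sum += s[i * step] * (k[2 + i] >> 1);
      sum = (sum + 32) >> 6;                       /* reg_32 rounding, shift 6 */
      return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum); /* packus clamp */
    }

The even/odd split in the SIMD code (outputs 6 4 2 0 and 7 5 3 1, zipped back together) is only a data-layout trick; per pixel the arithmetic is the same as above.
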
+
+/* The macro used to generate functions shifts the src_ptr up by 3 rows already.
+ */
+
+static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+ // Half of half of the interleaved rows
+ __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
+ src_reg_m10_hi_2;
+ __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
+ __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
+ __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words,
+ // shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
+ src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
+ src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+ // Partial output from first half
+ res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+ res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+ &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+ &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+ &kernel_reg_45);
+
+ // Add to get first half of the results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Now repeat everything again for the second half
+ // Partial output for second half
+ res_reg_m10_hi = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
+
+ res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2,
+ &kernel_reg_23);
+
+ src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
+ src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
+ res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2,
+ &kernel_reg_45);
+
+ src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
+ src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
+ res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2,
+ &kernel_reg_45);
+
+ // Second half of the results
+ res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+ res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+ res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_m10_lo_2 = src_reg_12_lo_2;
+ src_reg_m10_hi_1 = src_reg_12_hi_1;
+ src_reg_m10_hi_2 = src_reg_12_hi_2;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_01_lo_2 = src_reg_23_lo_2;
+ src_reg_01_hi_1 = src_reg_23_hi_1;
+ src_reg_01_hi_2 = src_reg_23_hi_2;
+ src_reg_1 = src_reg_3;
+ }
+}
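
The v4 kernels above compute two output rows per iteration and carry the already-interleaved row pairs forward, so each pass only loads two new source rows. A sketch of that loop structure in terms of the hypothetical filter4_px() above:

    static void vert4_sketch(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int width, uint32_t height, const int16_t *k) {
      uint32_t h;
      int x;
      for (h = height; h > 1; h -= 2) {            /* two rows per iteration */
        for (x = 0; x < width; ++x) {
          dst[x] = filter4_px(src + x, src_stride, k);
          dst[x + dst_stride] = filter4_px(src + x + src_stride, src_stride, k);
        }
        src += 2 * src_stride;  /* the SIMD code also re-labels the (1,2) and */
        dst += 2 * dst_stride;  /* (2,3) unpacked pairs as (-1,0) and (0,1)   */
      }
    }
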
+
+static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first;
+ __m128i even, odd;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the even output
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 6 4 2 0
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ dst_first = mm_zip_epi32_sse2(&even, &odd);
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Saturate and convert to 8-bit words
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo;
+ __m128i src_reg_12_lo, src_reg_23_lo;
+ // Half of half of the interleaved rows
+ __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
+ __m128i src_reg_01_lo_1, src_reg_01_lo_2;
+ __m128i src_reg_12_lo_1, src_reg_12_lo_2;
+ __m128i src_reg_23_lo_1, src_reg_23_lo_2;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words,
+ // shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+ res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+ &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+ &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+ &kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+ // Convert to 8-bit words
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());
+
+    // Save only half of the register (8 bytes)
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_m10_lo_2 = src_reg_12_lo_2;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_01_lo_2 = src_reg_23_lo_2;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first;
+ __m128i tmp_0, tmp_1;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Then we call multiply and add to get partial results
+    // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
+    // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the output
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Convert to 16-bit words
+ src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
+ src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
+ src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
+ src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
+
+ // Shuffle into the right format
+ tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
+ tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);
+
+ // Partial output
+ tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
+ tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);
+
+ // Output
+ dst_first = _mm_add_epi32(tmp_0, tmp_1);
+ dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
+
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Saturate and convert to 8-bit words
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo;
+ __m128i src_reg_12_lo, src_reg_23_lo;
+ // Half of half of the interleaved rows
+ __m128i src_reg_m10_lo_1;
+ __m128i src_reg_01_lo_1;
+ __m128i src_reg_12_lo_1;
+ __m128i src_reg_23_lo_1;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words,
+ // shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
+
+ res_reg_01_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+ // Convert to 8-bit words
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
+
+    // Save only 4 bytes of the register (one 4-pixel row)
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+static void vpx_highbd_filter_block1d4_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the even output
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i res_reg;
+ __m128i even, odd;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 6);
+
+ // Output 2 0
+ even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 3 1
+ odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ res_reg = _mm_unpacklo_epi32(even, odd);
+ res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = _mm_packs_epi32(res_reg, reg_zero);
+
+ // Saturate the result and save
+ res_reg = _mm_min_epi16(res_reg, reg_max);
+ res_reg = _mm_max_epi16(res_reg, reg_zero);
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
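
Unlike the 8-bit paths, the high bit-depth 4-tap kernels keep the full-precision taps and accumulate in 32 bits, so rounding uses CONV8_ROUNDING_NUM with a shift of CONV8_ROUNDING_BITS (both defined at the top of this file) before the result is clamped to the bit depth. Assuming mm_round_epi32_sse2(v, 64, 7) computes (v + 64) >> 7, a scalar sketch of one output pixel (hypothetical name, not libvpx API):

    static uint16_t highbd_filter4_px(const uint16_t *s, ptrdiff_t step,
                                      const int16_t *k, int bd) {
      const int32_t max = (1 << bd) - 1;
      int32_t sum = 0;
      int i;
      for (i = 0; i < 4; ++i) sum += (int32_t)s[i * step] * k[2 + i];
      sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS; /* (sum+64)>>7 */
      return (uint16_t)(sum < 0 ? 0 : sum > max ? max : sum);
    }
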
+
+static void vpx_highbd_filter_block1d4_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels as 16-bit words, and shuffle them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+ __m128i res_reg_m1012, res_reg_0123;
+
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
+ res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
+ res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
+ res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
+ res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);
+
+ // Round the words
+ res_reg_m1012 =
+ mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123 =
+ mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);
+
+ res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);
+
+ // Saturate according to bit depth
+ res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+ res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+ res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (4 words per row)
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10 = src_reg_12;
+ src_reg_01 = src_reg_23;
+ src_reg_1 = src_reg_3;
+ }
+}
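+
+// Editorial note (not part of the original patch): interleaving rows i and
+// i+1 and multiplying with the paired taps means _mm_madd_epi16 computes, per
+// 32-bit lane, s[i][x]*k[t] + s[i+1][x]*k[t+1]. So res_reg_m10 + res_reg_12
+// above is the full 4-tap column sum
+//   s[-1][x]*k[2] + s[0][x]*k[3] + s[1][x]*k[4] + s[2][x]*k[5]
+// which is then rounded by CONV8_ROUNDING_NUM, shifted by CONV8_ROUNDING_BITS
+// and clamped to [0, (1 << bd) - 1].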
+
+static void vpx_highbd_filter_block1d8_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+  // The two results are then added together to get the first half of the even
+  // output.
+  // Repeat multiple times to get the whole output.
+
+ __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+ src_reg_shift_3;
+ __m128i res_reg;
+ __m128i even, odd;
+ __m128i tmp_0, tmp_1;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+    // We will put the first half of the row's samples in the low half of the
+    // register, and the second half in the high half
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // Output 6 4 2 0
+ tmp_0 = _mm_srli_si128(src_reg, 4);
+ tmp_1 = _mm_srli_si128(src_reg_next, 2);
+ src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+ even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ tmp_0 = _mm_srli_si128(src_reg, 2);
+ tmp_1 = src_reg_next;
+ src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+ tmp_0 = _mm_srli_si128(src_reg, 6);
+ tmp_1 = _mm_srli_si128(src_reg_next, 4);
+ src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+ odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+ odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+ // Saturate the result and save
+ res_reg = _mm_min_epi16(res_reg, reg_max);
+ res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_highbd_filter_block1d8_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels as 16-bit words, and shuffle them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+ __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+ // Partial output for first half
+ res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+ res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+ res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+ res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo =
+ mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123_lo =
+ mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Partial output for second half
+ res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+ res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+ res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+ res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_hi =
+ mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123_hi =
+ mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine the two halves
+ res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);
+
+ // Saturate according to bit depth
+ res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+ res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+ res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save the whole register (8 words per row)
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo = src_reg_12_lo;
+ src_reg_m10_hi = src_reg_12_hi;
+ src_reg_01_lo = src_reg_23_lo;
+ src_reg_01_hi = src_reg_23_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d16_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+
+// From vpx_subpixel_8t_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
+#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
+#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
+#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
+#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
+#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+ sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
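+
+// Illustrative, hypothetical call (not part of the original patch) of the
+// generated vpx_convolve8_horiz_sse2(), following the signature in the
+// comments above and assuming the InterpKernel typedef (int16_t[8]) from
+// vpx_dsp/vpx_filter.h is in scope, as the wrappers above already require.
+// The kernel table and q4 arguments below are made up for illustration; real
+// callers pass one of the library's sub-pel filter tables and positions
+// derived from the motion vector.
+#if 0
+static void example_convolve8_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride) {
+  // Hypothetical 8-tap kernel for sub-pel position 0; taps sum to 128.
+  // The remaining 15 positions are left zero-initialized for brevity.
+  static const InterpKernel kernels[16] = {
+    { -1, 3, -10, 122, 18, -6, 2, 0 },
+  };
+  // Horizontal-only 8-tap filtering of a 16x16 block at sub-pel position 0,
+  // stepping one full pel (16/16 q4 units) per output pixel.
+  vpx_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, kernels,
+                           /*x0_q4=*/0, /*x_step_q4=*/16, /*y0_q4=*/0,
+                           /*y_step_q4=*/16, /*w=*/16, /*h=*/16);
+}
+#endif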
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
+ vpx_highbd_filter_block1d16_v8_avg_sse2
+#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
+ vpx_highbd_filter_block1d16_h8_avg_sse2
+#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
+ vpx_highbd_filter_block1d8_v8_avg_sse2
+#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
+ vpx_highbd_filter_block1d8_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
+ vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
+ vpx_highbd_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
+#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..2498bba173
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+ 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+#define CALC_CONVOLVE8_HORZ_ROW \
+ srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \
+ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \
+ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \
+ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \
+ s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \
+ s1[0] = convolve8_16_avx2(s1, f1); \
+ s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \
+ src_ptr += src_stride; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \
+ output_ptr += output_pitch; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], \
+ _mm256_extractf128_si256(s1[0], 1)); \
+ output_ptr += output_pitch;
+
+// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+
+ // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void vpx_filter_block1d16_h8_x_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i outReg32b1, outReg32b2;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], filt[4], s[4];
+
+ shuffle_filter_avx2(filter, f);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // multiply the size of the source and destination strides by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ __m256i srcReg;
+
+ // load the 2 strides of source
+ srcReg =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
+ 1);
+
+ // filter the source buffer
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b1 = convolve8_16_avx2(s, f);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
+ 1);
+
+ // filter the source buffer
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b2 = convolve8_16_avx2(s, f);
+
+ // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
+ // contain the first and second convolve result respectively
+ outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2);
+
+ src_ptr += src_stride;
+
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+
+    // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
+
+ output_ptr += dst_stride;
+ }
+
+  // If the number of rows is odd, process the last row of 16 bytes.
+ if (i > 0) {
+ __m128i srcReg;
+
+ // load the first 16 bytes of the last row
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg1 = convolve8_8_avx2(s, f);
+
+ // reading the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // filter the source buffer
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg2 = convolve8_8_avx2(s, f);
+
+ // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
+ // contain the first and second convolve result respectively
+ outReg1 = _mm_packus_epi16(outReg1, outReg2);
+
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+ }
+}
+
+static void vpx_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 1);
+}
+
+static void vpx_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i filt[4], f1[4], s1[4], srcReg;
+ __m128i f[4], s[4];
+ int y = output_height;
+
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ shuffle_filter_avx2(filter, f1);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // Process next 4 rows
+ while (y > 3) {
+ CALC_CONVOLVE8_HORZ_ROW
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 4;
+ }
+
+ // If remaining, then process 2 rows at a time
+ while (y > 1) {
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 2;
+ }
+
+ // For the remaining height.
+ if (y > 0) {
+ const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ f[0] = _mm256_castsi256_si128(f1[0]);
+ f[1] = _mm256_castsi256_si128(f1[1]);
+ f[2] = _mm256_castsi256_si128(f1[2]);
+ f[3] = _mm256_castsi256_si128(f1[3]);
+
+ // filter the source buffer
+ s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+ s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+ s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+ s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
+ s[0] = convolve8_8_ssse3(s, f);
+
+ // Saturate 16bit value to 8bit.
+ s[0] = _mm_packus_epi16(s[0], s[0]);
+
+ // Save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+ }
+}
+
+static INLINE void vpx_filter_block1d16_v8_x_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i srcRegHead1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], s1[4], s2[4];
+
+ shuffle_filter_avx2(filter, f);
+
+  // multiply the size of the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ {
+ __m128i s[6];
+ __m256i s32b[6];
+
+ // load 16 bytes 7 times in stride of src_pitch
+ s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
+ srcRegHead1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch)));
+
+    // place each pair of consecutive loads in the same 256-bit register
+ s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+ s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+ s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+ s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+ s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+ s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]),
+ _mm256_castsi256_si128(srcRegHead1), 1);
+
+ // merge every two consecutive registers except the last one
+ // the first lanes contain values for filtering odd rows (1,3,5...) and
+ // the second lanes contain values for filtering even rows (2,4,6...)
+ s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]);
+ s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]);
+ s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]);
+ s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]);
+ s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]);
+ s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
+ }
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ for (i = output_height; i > 1; i -= 2) {
+ __m256i srcRegHead2, srcRegHead3;
+
+    // load the next 2 rows of 16 bytes and place every two
+    // consecutive rows in the same 256-bit register
+ srcRegHead2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch)));
+ srcRegHead1 = _mm256_inserti128_si256(
+ srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1);
+ srcRegHead3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch)));
+ srcRegHead2 = _mm256_inserti128_si256(
+ srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1);
+
+ // merge the two new consecutive registers
+ // the first lane contain values for filtering odd rows (1,3,5...) and
+ // the second lane contain values for filtering even rows (2,4,6...)
+ s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2);
+ s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2);
+
+ s1[0] = convolve8_16_avx2(s1, f);
+ s2[0] = convolve8_16_avx2(s2, f);
+
+ // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
+ // contain the first and second convolve result respectively
+ s1[0] = _mm256_packus_epi16(s1[0], s2[0]);
+
+ src_ptr += src_stride;
+
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(s1[0]);
+ outReg2 = _mm256_extractf128_si256(s1[0], 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+
+    // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+
+ output_ptr += dst_stride;
+
+ // shift down by two rows
+ s1[0] = s1[1];
+ s2[0] = s2[1];
+ s1[1] = s1[2];
+ s2[1] = s2[2];
+ s1[2] = s1[3];
+ s2[2] = s2[3];
+ srcRegHead1 = srcRegHead3;
+ }
+}
+
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 0);
+}
+
+static void vpx_filter_block1d16_v8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 1);
+}
+
+static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum for each pixel. Adding
+  // the two partial sums gives us the first half of the output. Repeat to get
+  // the second half of the output, then pack the two halves together.
+ // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+ // time.
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i dst_first, dst_second;
+ __m256i tmp_0, tmp_1;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for first half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for second half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_second = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round each result
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+ dst_second = mm256_round_epi16(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm256_packus_epi16(dst_first, dst_second);
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_first);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+ // Reorder into 2 1 1 2
+ src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+
+ dst_first = _mm256_packus_epi16(dst_first, dst_first);
+ dst_first = _mm256_permute4x64_epi64(dst_first, 0x8);
+
+ _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first));
+ }
+}
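+
+// Illustrative scalar sketch (not part of the original patch) of one output
+// pixel of the 8-bit 4-tap path above. The kernel is shifted right by one
+// before being packed to bytes so the unsigned-by-signed
+// _mm256_maddubs_epi16 products cannot saturate; rounding with 32 and
+// shifting by 6 (instead of 64 and 7) compensates for that halving, assuming
+// the taps are even so the pre-shift is exact.
+#if 0
+static uint8_t filter4_h_pixel_ref(const uint8_t *s, const int16_t *kernel) {
+  const int k2 = kernel[2] >> 1, k3 = kernel[3] >> 1;
+  const int k4 = kernel[4] >> 1, k5 = kernel[5] >> 1;
+  int sum = s[-1] * k2 + s[0] * k3 + s[1] * k4 + s[2] * k5;
+  sum = (sum + 32) >> 6;  // same rounding as mm256_round_epi16(&x, &reg_32, 6)
+  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+}
+#endif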
+
+static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get partial
+  // output. Then we add the partial output of the next pair of rows to get
+  // the final output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi;
+ __m256i res_reg, res_reg_lo, res_reg_hi;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+ src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+ src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23);
+
+ // Output from first half
+ res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23);
+ res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45);
+ res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo);
+
+ // Output from second half
+ res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23);
+ res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45);
+ res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi);
+
+ // Round the words
+ res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6);
+ res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);
+
+ // Save the result
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001_lo = src_reg_1223_lo;
+ src_reg_m1001_hi = src_reg_1223_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
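+
+// Editorial note (not part of the original patch): per column x, the
+// unpack/maddubs/adds sequence above evaluates
+//   (s[-1][x]*k2 + s[0][x]*k3) + (s[1][x]*k4 + s[2][x]*k5)
+// with k2..k5 the pre-halved byte kernel, followed by the same (+32) >> 6
+// rounding and unsigned-saturating pack as the horizontal path, applied down
+// a column instead of across a row.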
+
+static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum for each pixel. Adding
+  // the two partial sums gives us the output.
+ // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+ // time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ __m256i dst_reg;
+ __m256i tmp_0, tmp_1;
+ const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_reg = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round the result
+ dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_reg = _mm256_packus_epi16(dst_reg, dst_reg);
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ __m128i dst_reg;
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
+ __m128i tmp_0, tmp_1;
+
+ __m128i src_reg_shift_0 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0));
+ __m128i src_reg_shift_2 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2));
+
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0,
+ _mm256_castsi256_si128(kernel_reg_23));
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2,
+ _mm256_castsi256_si128(kernel_reg_45));
+ dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
+
+ dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6);
+
+ dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_reg);
+ }
+}
+
+static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get partial
+  // output. Then we add the partial output of the next pair of rows to get
+  // the final output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001, res_reg_1223;
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Output
+ res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23);
+ res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45);
+ res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223);
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into a single register in the form
+ // k[5:2] k[5:2] k[5:2] k[5:2]
+ // Then we shuffle the source into
+ // s[5:2] s[4:1] s[3:0] s[2:-1]
+ // Calling multiply and add gives us half of the sum next to each other.
+ // Calling horizontal add then gives us the output.
+ // Since avx2 has 256-bit register, we can do 2 rows at a time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ int h;
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+
+ __m256i shuf_idx =
+ _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
+ 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ for (h = height; h > 1; h -= 2) {
+ // Load the source
+ const __m256i src_reg = mm256_loadu2_epi64(
+ (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride));
+ const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+
+ // Get the result
+ __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+ dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
+
+ // Round result
+ dst = mm256_round_epi16(&dst, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst = _mm256_packus_epi16(dst, _mm256_setzero_si256());
+
+ // Save
+ mm256_storeu2_epi32((__m128i *const)dst_ptr,
+ (__m128i *const)(dst_ptr + dst_stride), &dst);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ if (h > 0) {
+ // Load the source
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
+ __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
+ __m128i src_reg_shuf =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
+
+ // Get the result
+ __m128i dst =
+ _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg));
+ dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
+
+ // Round result
+ dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6);
+
+ // Pack to 8-bits
+ dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ }
+}
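+
+// Editorial note (not part of the original patch): here each 32-bit group of
+// the shuffled source holds s[x-1..x+2], so _mm256_maddubs_epi16 with the
+// repeated k2 k3 k4 k5 bytes yields the two adjacent partial sums
+//   s[x-1]*k2 + s[x]*k3   and   s[x+1]*k4 + s[x+2]*k5
+// and _mm256_hadds_epi16 folds each pair into the full 4-tap sum before the
+// usual (+32) >> 6 rounding and pack.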
+
+static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get partial output.
+  // Calling horizontal add then gives us the complete output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+
+ // Result after multiply and add
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Combine all the rows
+ src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+
+ // Output
+ res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg);
+ res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256());
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[8];
+ __m128i s[9];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ shuffle_filter_avx2(filter, f);
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // merge the result together
+ // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+ // r07 r06 r05 r04 r03 r02 r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+ // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+ // r17 r16 r15 r14 r13 r12 r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+ // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+ // r27 r26 r25 r24 r23 r22 r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+ // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+ // r37 r36 r35 r34 r33 r32 r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+ // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+ // r47 r46 r45 r44 r43 r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+ // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+ // r57 r56 r55 r54 r53 r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+ // Merge together
+ // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+ // r01|r10 r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+  // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+ // r21|r30 r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+ // r41|r50 r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // Process 2 rows at a time
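+  // Roughly, each pass of this loop produces two 8-pixel output rows, where
+  // every output pixel is an 8-tap vertical convolution of the same column in
+  // eight consecutive source rows; assuming convolve8_16_avx2 applies the
+  // usual libvpx rounding, that is
+  //   clip_pixel((f0*r0 + f1*r1 + ... + f7*r7 + 64) >> 7)
+  // with f0..f7 the filter taps and clip_pixel() shorthand for clamping to
+  // [0, 255].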
+ do {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+ // 0 r67 r66 r65 r64 r63 r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+ // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+ // 0 r77 r76 r75 r74 r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+ // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+ // r62 | r71 r61|r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ ss[0] = convolve8_16_avx2(ss, f);
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ /* shift down two rows */
+ s[6] = s[8];
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+ output_ptr += out_pitch;
+ _mm_storel_epi64((__m128i *)&output_ptr[0],
+ _mm256_extractf128_si256(ss[0], 1));
+ output_ptr += out_pitch;
+ ss[0] = ss[1];
+ ss[1] = ss[2];
+ ss[2] = ss[3];
+ y -= 2;
+ } while (y > 1);
+}
+
+static void vpx_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64_256bit;
+ unsigned int y = output_height;
+
+ assert(output_height > 1);
+
+ addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+ // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
+  // the same data in both lanes of the 128 bit register.
+ // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each)
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ {
+ ptrdiff_t src_stride;
+ __m256i filt1Reg, filt2Reg, firstFilters, secondFilters;
+ // have the same data in both lanes of a 256 bit register
+ // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0
+ // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each)
+ const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+
+ // duplicate only the first 32 bits
+ // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1
+ // f0|f3 f2 f1 f0|f3 f2 f1 f0
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5
+ // f4|f7 f6 f5 f4|f7 f6 f5 f4
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3
+ // s2 s4 s3 s2 s1 s3 s2 s1 s0
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+
+ // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7
+ // s6 s8 s7 s6 s5 s7 s6 s5 s4
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+    // multiply the size of the source stride by two
+ src_stride = src_pitch << 1;
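+
+    // As a rough scalar model, each pass of the loop below computes four
+    // pixels in each of two rows, where every output pixel is the full 8-tap
+    // horizontal convolution
+    //   clip_pixel((f0*s[-3] + f1*s[-2] + ... + f7*s[4] + 64) >> 7)
+    // with s[d] the source sample d positions to the right of the output
+    // pixel; the 32 added to every 16-bit lane becomes 64 after the pairwise
+    // horizontal add.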
+
+ do {
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1;
+ // load the 2 strides of source
+ // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
+ // r06 r05 r04 r03 r02 r01 r00
+ srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);
+
+ // filter the source buffer
+ // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
+ // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||...
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010
+ // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010
+ // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit)
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+
+ // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit)
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+ // save first row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1));
+ output_ptr += output_pitch;
+
+ // save second row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr += output_pitch;
+
+ y = y - 2;
+ } while (y > 1);
+
+ // For remaining height
+ if (y > 0) {
+ __m128i srcReg1, srcRegFilt1_1, addFilterReg64;
+ __m128i srcRegFilt2;
+
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1 =
+ _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+      // add the rounding constant and shift each 16 bit value right by 7 bits
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+
+      // shrink to 8 bit each 16 bits; the low four bytes hold the convolve
+      // result for this remaining row
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+ }
+}
+
+static void vpx_filter_block1d4_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[8];
+ __m128i r1[10];
+ __m128i s[11];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by four
+ const ptrdiff_t src_stride = src_pitch << 2;
+ const ptrdiff_t out_stride = out_pitch << 2;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 0x01));
+
+ shuffle_filter_avx2(filter, f);
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
+ r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
+
+ // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
+ r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
+
+ // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
+ r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
+
+ // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
+ r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
+
+ // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
+ r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
+
+ // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
+ r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
+
+ // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
+ // r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
+
+ // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
+ // r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
+
+ // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
+ // r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
+
+ // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
+ // r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+ // r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+ // r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // Process 4 rows at a time
+ while (y >= 4) {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+ s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+ s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+ // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+ r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+ r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+ // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
+ r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
+
+ // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
+ r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
+
+ // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
+ // r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
+
+ // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
+ // r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
+
+ // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
+ // r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
+
+ // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
+ // r80|r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
+
+ // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
+ // r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
+ // r63....r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+
+ ss[0] = convolve8_16_avx2(ss, f);
+
+ // r3 r2 r3 r2 r1 r0 r1 r0
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ mm256_storeu2_epi32((__m128i *const)output_ptr,
+ (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+ ss[0] = _mm256_srli_si256(ss[0], 4);
+
+ mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+ (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+ output_ptr += out_stride;
+
+ ss[0] = ss[2];
+ ss[1] = ss[3];
+
+ s[6] = s[10];
+
+ r1[4] = r1[8];
+ r1[5] = r1[9];
+
+ y -= 4;
+ }
+
+ // Process 2 rows
+ if (y == 2) {
+ __m128i ss1[4], f1[4];
+
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ f1[0] = _mm256_castsi256_si128(f[0]);
+ f1[1] = _mm256_castsi256_si128(f[1]);
+ f1[2] = _mm256_castsi256_si128(f[2]);
+ f1[3] = _mm256_castsi256_si128(f[3]);
+
+ // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+ r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+ r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+ // r23 r13....r20 r10|r13 r03....r10 r00
+ ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20
+ ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40
+ ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+
+ // r83 r73....r80 r70|r73 r63....r70 r60
+ ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+
+ ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+ // r1 r0 r1 r0
+ ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+ // Save first row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ output_ptr += out_pitch;
+
+ ss1[0] = _mm_srli_si128(ss1[0], 4);
+ // Save second row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+#if VPX_ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#else // VPX_ARCH_X86
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#endif // VPX_ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
+
+#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2
+#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2
+#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2
+#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2
+#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2
+#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
+#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..4ea2752d38
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,1087 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> // SSSE3
+
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+ const __m128i *const s, const int16_t *const filter) {
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+ return convolve8_8_ssse3(s, f);
+}
+
+// Used by the avx2 implementation.
+#if VPX_ARCH_X86_64
+// Use the intrinsics below
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#else // VPX_ARCH_X86
+// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+#endif
+
+#if VPX_ARCH_X86_64
+void vpx_filter_block1d4_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+ __m128i srcRegFilt1, srcRegFilt2;
+ __m128i addFilterReg64, filtersReg, srcReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
+  // the same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // set up the byte-shuffle patterns used to gather adjacent source samples
+ shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6);
+ shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // sum the results together, saturating only on the final step
+    // the specific order of the additions prevents overflow
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
+
+ // extract the higher half of the register
+ srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
+
+ // add the rounding offset early to avoid another saturated add
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+    // shift each 16 bit value right by 7 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr += src_pitch;
+
+ // save only 4 bytes
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void vpx_filter_block1d8_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ unsigned int i;
+ __m128i f[4], filt[4], s[4];
+
+ shuffle_filter_ssse3(filter, f);
+ filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+ filt[3] =
+ _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
+
+ for (i = 0; i < output_height; i++) {
+ const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ s[0] = _mm_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm_shuffle_epi8(srcReg, filt[3]);
+ s[0] = convolve8_8_ssse3(s, f);
+
+ // shrink to 8 bit each 16 bits
+ s[0] = _mm_packus_epi16(s[0], s[0]);
+
+ src_ptr += src_pitch;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void vpx_filter_block1d8_v8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ unsigned int i;
+ __m128i f[4], s[8], ss[4];
+
+ shuffle_filter_ssse3(filter, f);
+
+ // load the first 7 rows of 8 bytes
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 8 bytes
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+
+ // merge the result together
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+
+ // merge the result together
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ ss[0] = convolve8_8_ssse3(ss, f);
+ // shrink to 8 bit each 16 bits
+ ss[0] = _mm_packus_epi16(ss[0], ss[0]);
+
+ src_ptr += src_pitch;
+
+ // shift down a row
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]);
+
+ output_ptr += out_pitch;
+ }
+}
+#endif // VPX_ARCH_X86_64
+
+static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum for each pixel. Adding
+  // the two partial sums gives us the first half of the output. Repeat for
+  // the next eight pixels, then pack the two halves to get the 16 outputs.
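+  //
+  // Only the middle four taps are used; this path is presumably selected for
+  // kernels whose outer taps are zero, so dropping them loses nothing. In
+  // scalar terms (the kernel is shifted right by one bit below, so the
+  // rounding becomes (+32) >> 6 instead of (+64) >> 7), each output pixel is
+  // roughly
+  //   clip_pixel(((k2 >> 1)*s[-1] + (k3 >> 1)*s[0] + (k4 >> 1)*s[1] +
+  //               (k5 >> 1)*s[2] + 32) >> 6)
+  // where s[d] is the source sample d positions right of the output pixel and
+  // clip_pixel() is shorthand for clamping to [0, 255].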
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m128i dst_first, dst_second;
+ __m128i tmp_0, tmp_1;
+ __m128i idx_shift_0 =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m128i idx_shift_2 =
+ _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for first half
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Partial result for second half
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_second = _mm_adds_epi16(tmp_0, tmp_1);
+
+ // Round each result
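+    // (mm_round_epi16_sse2() presumably adds the 16-bit rounding register and
+    // then arithmetic-shifts each lane right, i.e. (x + 32) >> 6 here.)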
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+ dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm_packus_epi16(dst_first, dst_second);
+ _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // so that we can call multiply and add with the kernel to get 16-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
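+  //
+  // In scalar terms (with the kernel halved below, so the rounding is
+  // (+32) >> 6), the output pixel for row y is roughly
+  //   clip_pixel(((k2 >> 1)*s[y-1] + (k3 >> 1)*s[y] + (k4 >> 1)*s[y+1] +
+  //               (k5 >> 1)*s[y+2] + 32) >> 6)
+  // evaluated on the same column of four consecutive source rows.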
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+ // Partial output from first half
+ res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23);
+ res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23);
+
+ res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45);
+ res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45);
+
+ // Add to get first half of the results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Partial output for second half
+ res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23);
+ res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23);
+
+ res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45);
+ res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45);
+
+ // Second half of the results
+ res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+ res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+ res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo = src_reg_12_lo;
+ src_reg_m10_hi = src_reg_12_hi;
+ src_reg_01_lo = src_reg_23_lo;
+ src_reg_01_hi = src_reg_23_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum for each pixel. Adding
+  // the two partial sums gives the 8-pixel output, which is then rounded and
+  // packed down to 8 bits.
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m128i dst_first;
+ __m128i tmp_0, tmp_1;
+ __m128i idx_shift_0 =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m128i idx_shift_2 =
+ _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the result
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round the result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get 16-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+ __m128i res_reg_m1012, res_reg_0123;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23);
+ res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23);
+
+ res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45);
+ res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45);
+
+ // Add to get entire output
+ res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12);
+ res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);
+
+ // Round the words
+ res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
+ res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6);
+
+ // Pack from 16-bit to 8-bit
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10 = src_reg_12;
+ src_reg_01 = src_reg_23;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into a single register in the form
+ // k[5:2] k[5:2] k[5:2] k[5:2]
+ // Then we shuffle the source into
+ // s[5:2] s[4:1] s[3:0] s[2:-1]
+ // Calling multiply and add gives us half of the sum next to each other.
+ // Calling horizontal add then gives us the output.
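+  //
+  // Concretely, for output pixel x the multiply-add leaves the two partial
+  // sums k2*s[x-1] + k3*s[x] and k4*s[x+1] + k5*s[x+2] (with the halved taps)
+  // in adjacent 16-bit lanes, and the horizontal add collapses them into one
+  // sum per pixel, which is then rounded ((+32) >> 6) and packed.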
+
+ __m128i kernel_reg; // Kernel
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shuf;
+ __m128i dst_first;
+ __m128i shuf_idx =
+ _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx);
+
+ // Get the result
+ dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg);
+ dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
+
+ // Round result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[2,0] s[1,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel partial output. Then
+ // we can call horizontal add to get the output.
+ // Finally, we can add multiple rows together to get the desired output.
+ // This is done two rows at a time
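+  //
+  // Because each row is only four bytes wide, two rows are interleaved first
+  // at 32-bit and then at byte granularity, so one multiply-add plus one
+  // horizontal add yields a complete 4-pixel output row; the rounding is
+  // again (+32) >> 6 with the halved kernel.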
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source.
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+ __m128i src_reg_m1001, src_reg_1223;
+ __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
+
+ __m128i kernel_reg; // Kernel
+
+ // Result after multiply and add
+ __m128i reg_0, reg_1;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
+
+ // Put three rows next to each other
+ src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+ src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+ src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
+
+ // Put three rows next to each other
+ src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Put all four rows next to each other
+ src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+ src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
+
+ // Get the results
+ reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
+ reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
+ reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
+ reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
+
+ // Round the words
+ reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
+ reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
+
+ // Pack from 16-bit to 8-bit and put them in the right order
+ reg_0 = _mm_packus_epi16(reg_0, reg_0);
+ reg_1 = _mm_packus_epi16(reg_1, reg_1);
+
+ // Save the result
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3
+#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3
+#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);
+ store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
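+  // e.g. h = 135 gives y = 136; note that an exact multiple of 8 also gains a
+  // full extra 8 rows (h = 64 gives y = 72), which the caller's oversized
+  // temp buffer appears to allow for.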
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[4], ss[2];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);
+ transpose_16bit_4x4(s, ss);
+ // 00 01 10 11 20 21 30 31
+ s[0] = ss[0];
+ // 02 03 12 13 22 23 32 33
+ s[1] = _mm_srli_si128(ss[0], 8);
+ // 04 05 14 15 24 25 34 35
+ s[2] = ss[1];
+ // 06 07 16 17 26 27 36 37
+ s[3] = _mm_srli_si128(ss[1], 8);
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);
+ s[1] = _mm_srli_si128(s[0], 4);
+ s[2] = _mm_srli_si128(s[0], 8);
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+ // shrink to 8 bit each 16 bits
+ return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
+ int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+
+ for (i = 0; i < w; i += 16) {
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);
+
+ // merge the result together
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink to 8 bit each 16 bits, the first lane contains the first convolve
+    // result and the second lane contains the second convolve result
+ temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+ src += 16;
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
+ }
+}
+
+static void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+ // When calling in frame scaling function, the smallest scaling factor is x1/4
+ // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+ // big enough.
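+  // As a worked example with the largest normative case (h = 64,
+  // y_step_q4 = 32, y0_q4 = 15), the formula below gives
+  //   (((64 - 1) * 32 + 15) >> 4) + 8 = 126 + 8 = 134 rows,
+  // comfortably within the 135-row allowance.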
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
+
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..c8455e13a2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -0,0 +1,989 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: taps 3 and 4 have to be applied and added after the other taps to
+;avoid overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
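+    ;0x0400040 = two words of 0x0040 (64); broadcast below into krd as the
+    ;rounding term for the >> 7 shift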
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpckldq xmm0, xmm1 ;two rows in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
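+;Load eight 8-byte rows for the vertical filter. Expects rax = src_pitch and
+;rdx = 3 * src_pitch; rsi is advanced by one row inside the macro so that
+;rows 2..7 stay reachable with the available addressing modes.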
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void vpx_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d4_v8_sse2)
+sym(vpx_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d8_v8_sse2)
+sym(vpx_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d16_v8_sse2)
+sym(vpx_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v8_avg_sse2)
+sym(vpx_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v8_avg_sse2)
+sym(vpx_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v8_avg_sse2)
+sym(vpx_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d4_h8_sse2)
+sym(vpx_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d8_h8_sse2)
+sym(vpx_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d16_h8_sse2)
+sym(vpx_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h8_avg_sse2)
+sym(vpx_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h8_avg_sse2)
+sym(vpx_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h8_avg_sse2)
+sym(vpx_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..fe617f1207
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -0,0 +1,803 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
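+; 64 = 1 << (FILTER_BITS - 1): the rounding term added before the >> 7 shift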
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to prevent overflow.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
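+; Pairing the taps this way keeps each intermediate sum within the int16
+; range for the filters used here; only the final add relies on saturation.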
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+    ; TODO(slavarnway): use xmm registers for these on VPX_ARCH_X86_64.
+    ; pmaddubsw has a higher latency on some platforms; this might be eased
+    ; by interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if VPX_ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if VPX_ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if VPX_ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
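+    ; heightd is pre-decremented so that the two-rows-per-iteration loop
+    ; below leaves it at exactly 0 only when the original height was odd;
+    ; the trailing single-row code then runs just for that case.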
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
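+    ; The even-offset loads (m0..m3) produce the even output pixels and the
+    ; odd-offset loads (m4..m7) the odd ones; pmaddubsw applies one tap pair
+    ; per byte pair, and the two halves are interleaved by punpcklbw below.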
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3
+SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3
+SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3
+SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3
+SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3
+SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect the CPU type and choose the better-performing code.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
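+; When set, x86-64 builds also take the register-light code path shared with
+; 32-bit builds, which (as the name suggests) is the faster choice on slow
+; CPUs such as the Celeron.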
+
+%if VPX_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if VPX_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; VPX_ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; VPX_ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if VPX_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; VPX_ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; VPX_ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3
+SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3
+SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3
+SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3
+SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3
+SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..65790b1c21
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -0,0 +1,450 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
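+; The *_v2/*_h2 functions below are used with 2-tap (bilinear) kernels, where
+; only the two center taps (k3, k4) of the 8-tap filter array are nonzero.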
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+    punpckldq xmm0, xmm1 ;two rows in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(vpx_filter_block1d4_v2_sse2)
+sym(vpx_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_sse2)
+sym(vpx_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_sse2)
+sym(vpx_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v2_avg_sse2)
+sym(vpx_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_avg_sse2)
+sym(vpx_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_avg_sse2)
+sym(vpx_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_sse2)
+sym(vpx_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_sse2)
+sym(vpx_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_sse2)
+sym(vpx_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_avg_sse2)
+sym(vpx_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_avg_sse2)
+sym(vpx_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_avg_sse2)
+sym(vpx_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..32e3cd3d9f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,420 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
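+    ; 0x0100 per word: pmulhrsw by 256 computes (x * 256 + 0x4000) >> 15,
+    ; i.e. (x + 64) >> 7, merging the rounding and the shift in one step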
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(vpx_filter_block1d4_v2_ssse3)
+sym(vpx_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_ssse3)
+sym(vpx_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_ssse3)
+sym(vpx_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v2_avg_ssse3)
+sym(vpx_filter_block1d4_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_avg_ssse3)
+sym(vpx_filter_block1d8_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_avg_ssse3)
+sym(vpx_filter_block1d16_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_ssse3)
+sym(vpx_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_ssse3)
+sym(vpx_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_ssse3)
+sym(vpx_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_avg_ssse3)
+sym(vpx_filter_block1d4_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_avg_ssse3)
+sym(vpx_filter_block1d8_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_avg_ssse3)
+sym(vpx_filter_block1d16_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret